Take a look at the following steps:
- Let's start by plotting the distribution of variants across the genome in both files:
%matplotlib inlinefrom collections import defaultdictimport functoolsimport numpy as npimport seaborn as snsimport matplotlib.pyplot as pltimport vcfdef do_window(recs, size, fun): start = None win_res = [] for rec in recs: if not rec.is_snp or len(rec.ALT) > 1: continue if start is None: start = rec.POS my_win = 1 + (rec.POS - start) // size while len(win_res) < my_win: win_res.append([]) win_res[my_win - 1].extend(fun(rec)) return win_reswins = {}size = 2000names = ['centro.vcf.gz', 'standard.vcf.gz']for name in names: recs = vcf.Reader(filename=name) wins[name] = do_window(recs, size, lambda x: [1]) ...