13 &usage if (@ARGV < 1);
14 my $command = shift(@ARGV);
15 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
16 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
17 gapstats=>\&gapstats);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
23 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
25 my $fn = shift(@ARGV);
27 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
28 $h{$_} = 1 for (@ARGV);
34 my @s = @t[0..8]; # all fixed fields + FORMAT
41 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
42 print join("\t", @s), "\n";
46 print join("\t", @t[0..7]), "\n";
48 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
56 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
60 print join("\n", @t[9..$#t]), "\n";
67 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
76 @_ = split(":", $t[8]);
78 if ($_[$_] eq 'GT') { $s = $_; last; }
81 print join("\t", @t), "\n";
85 if ($t[$_] =~ /^0,0,0/) {
86 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
91 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
93 $info =~ s/(;?)AC=(\d+)//;
94 $info =~ s/(;?)AN=(\d+)//;
101 print join("\t", @t), "\n";
108 getopts('t:', \%opts);
109 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
110 my $cutoff = $opts{t};
111 my ($last, $lastchr) = (0x7fffffff, '');
112 my ($x, $y, $n) = (0, 0, 0);
114 if (/^([^#\s]+)\s(\d+)/) {
115 my ($chr, $pos) = ($1, $2);
116 if (/NEIR=([\d\.]+)/) {
118 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
120 $last = $pos; $lastchr = $chr;
123 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
124 print "Fraction: ", $y/$n, "\n";
125 print "Length: $x\n";
129 my %opts = (r=>'', s=>0.02, v=>undef);
130 getopts('r:s:v', \%opts);
131 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
132 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
133 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
135 my $is_vcf = defined($opts{v})? 1 : 0;
136 if ($opts{r}) { # read the reference positions
138 open($fh, $opts{r}) || die;
143 $h{$t[0],$t[1]} = $t[4];
145 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
150 my $hsize = scalar(keys %h);
155 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
156 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
157 my @s = split(',', $t[4]);
158 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
159 next if (length($s[0]) != 1);
163 my $aa = $h{$t[0],$t[1]};
165 my @aaa = split(",", $aa);
167 $hit = 1 if ($_ eq $s[0]);
171 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
173 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
175 push(@a, [-1, 0, 0, 0]); # end marker
176 die("[qstats] No SNP data!\n") if (@a == 0);
177 @a = sort {$b->[0]<=>$a->[0]} @a;
180 my @c = (0, 0, 0, 0);
184 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
186 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
187 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
188 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
189 my $a = $c[1] - $lc[1];
190 my $b = $c[2] - $lc[2];
191 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
192 print join("\t", $last, @c, @x), "\n";
193 $next = $c[0]/@a + $opts{s};
194 $lc[1] = $c[1]; $lc[2] = $c[2];
196 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
202 my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4);
203 getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts);
205 Usage: vcfutils.pl varFilter [options] <in.vcf>
207 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
208 -d INT minimum read depth [$opts{d}]
209 -D INT maximum read depth [$opts{D}]
210 -a INT minimum number of alternate bases [$opts{a}]
211 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
212 -W INT window size for filtering adjacent gaps [$opts{W}]
213 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
214 -2 FLOAT min P-value for baseQ bias [$opts{2}]
215 -3 FLOAT min P-value for mapQ bias [$opts{3}]
216 -4 FLOAT min P-value for end distance bias [$opts{4}]
217 -p print filtered variants
219 Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
220 \n/) if (@ARGV == 0 && -t STDIN);
222 # calculate the window size
223 my ($ol, $ow) = ($opts{W}, $opts{w});
224 my $max_dist = $ol > $ow? $ol : $ow;
226 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
232 next if ($t[4] eq '.'); # skip non-var sites
233 # check if the site is a SNP
235 if (length($t[3]) > 1) {
238 my @s = split(',', $t[4]);
240 $is_snp = 0 if (length > 1);
243 # clear the out-of-range elements
245 # Still on the same chromosome and the first element's window still affects this position?
246 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
247 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
251 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
252 if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
253 $dp = $1 + $2 + $3 + $4;
256 if ($t[7] =~ /DP=(\d+)/i) {
259 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
260 # the depth and mapQ filter
262 if ($dp < $opts{d}) {
264 } elsif ($dp > $opts{D}) {
268 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
269 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
270 $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
271 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
273 # site dependent filters
274 my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs
276 if (!$is_snp) { # an indel
277 $rlen = length($t[3]) - 1;
278 $indel_score = $t[5] * 100 + $dp_alt;
280 for my $x (@staging) {
281 next if ($x->[0] >= 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
284 # check the staging list for indel filtering
285 for my $x (@staging) {
286 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
287 if ($x->[0] < $indel_score) {
294 for my $x (@staging) {
295 next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
301 push(@staging, [$indel_score, $flt, $rlen, @t]);
303 # output the last few elements in the staging list
305 varFilter_aux(shift @staging, $opts{p});
310 my ($first, $is_print) = @_;
311 if ($first->[1] == 0) {
312 print join("\t", @$first[3 .. @$first-1]), "\n";
313 } elsif ($is_print) {
314 print STDERR join("\t", substr("UQdDaGgP", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
320 $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
324 next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
325 my @s = split(',', $t[4]);
327 my $l = length($x) - length($t[3]) + 5000;
329 $l = -(length($x) - 1) + 5000;
330 } elsif ($x =~ /^\+/) {
331 $l = length($x) - 1 + 5000;
336 for (my $i = 0; $i < 10000; ++$i) {
337 next if ($c0[$i] == 0);
339 $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
340 printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
342 printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
346 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
347 print "##fileformat=VCFv4.0\n";
348 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
351 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
356 $t[9] = reverse($t[9]);
357 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
359 my @a = split("/", $t[9]);
361 push(@alt, $_) if ($_ ne $alt[0]);
367 $alt[$_] = "N$alt[$_]";
370 my $ref = shift(@alt);
371 my $af = $t[13] > 0? ";AF=$t[13]" : '';
372 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
373 my $info = "molType=$t[10];class=$t[11]$valid$af";
374 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
379 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
380 my $fn = shift(@ARGV);
382 warn("Parsing UCSC SNPs...\n");
384 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
387 next if ($t[3] - $t[2] != 1); # not SNP
388 @{$map{$t[4]}} = @t[1,3,7];
392 warn("Writing VCF...\n");
393 print "##fileformat=VCFv4.0\n";
396 if ($t[0] eq 'rs#') { # the first line
397 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
399 next unless ($map{$t[0]});
400 next if (length($t[1]) != 3); # skip non-SNPs
401 my $a = \@{$map{$t[0]}};
403 my @u = split('/', $t[1]);
405 $u[1] = $u[0]; $u[0] = $ref;
406 } elsif ($u[0] ne $ref) { next; }
409 $w{$u[0]} = 0; $w{$u[1]} = 1;
410 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
416 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
417 if (!defined($a[0]) || !defined($a[1])) {
421 push(@s, "$a[0]/$a[1]");
425 print join("\t", @s), "\n";
432 Usage: vcfutils.pl <command> [<arguments>]\n
433 Command: subsam get a subset of samples
434 listsam list the samples
435 fillac fill the allele count field
436 qstats SNP stats stratified by QUAL
437 varFilter filtering short variants
438 hapmap2vcf convert the hapmap format to VCF
439 ucscsnp2vcf convert UCSC SNP SQL dump to VCF