X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bcftools%2Fvcfutils.pl;h=2b7ba0b1d00932be1f79f08e4b3bd8e8db951295;hp=d0b797143cf56fe0a67383ec4f610096eaecf2c0;hb=2347ad62090e036841008e054c234868e21f7a80;hpb=8d2494d1fb7cd0fa7c63be5ffba8dd1a11457522 diff --git a/bcftools/vcfutils.pl b/bcftools/vcfutils.pl index d0b7971..2b7ba0b 100755 --- a/bcftools/vcfutils.pl +++ b/bcftools/vcfutils.pl @@ -10,15 +10,31 @@ use Getopt::Std; exit; sub main { - my $version = '0.1.0'; &usage if (@ARGV < 1); my $command = shift(@ARGV); my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, - hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats); + hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats, + gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; } +sub splitchr { + my %opts = (l=>5000000); + getopts('l:', \%opts); + my $l = $opts{l}; + die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] \n/) if (@ARGV == 0 && -t STDIN); + while (<>) { + my @t = split; + my $last = 0; + for (my $i = 0; $i < $t[1];) { + my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l; + print "$t[0]:".($i+1)."-$e\n"; + $i = $e; + } + } +} + sub subsam { die(qq/Usage: vcfutils.pl subsam [samples]\n/) if (@ARGV == 0); my ($fh, %h); @@ -70,7 +86,7 @@ sub fillac { print; } else { my @t = split; - my @c = (0); + my @c = (0, 0); my $n = 0; my $s = -1; @_ = split(":", $t[8]); @@ -155,7 +171,7 @@ Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions # next if (length($t[3]) != 1 || uc($t[3]) eq 'N'); $t[3] = uc($t[3]); $t[4] = uc($t[4]); my @s = split(',', $t[4]); - $t[5] = 3 if ($t[5] < 0); + $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0); next if (length($s[0]) != 1); my $hit; if ($is_vcf) { @@ -199,44 +215,51 @@ Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions # } sub varFilter { - my %opts = (d=>1, D=>10000, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, F=>.001); - getopts('pq:d:D:l:Q:w:W:N:G:F:', \%opts); + my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4); + getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts); die(qq/ Usage: vcfutils.pl varFilter [options] Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] - -q INT minimum RMS mapping quality for gaps [$opts{q}] -d INT minimum read depth [$opts{d}] -D INT maximum read depth [$opts{D}] - - -G INT min indel score for nearby SNP filtering [$opts{G}] + -a INT minimum number of alternate bases [$opts{a}] -w INT SNP within INT bp around a gap to be filtered [$opts{w}] - - -W INT window size for filtering dense SNPs [$opts{W}] - -N INT max number of SNPs in a window [$opts{N}] - - -l INT window size for filtering adjacent gaps [$opts{l}] - + -W INT window size for filtering adjacent gaps [$opts{W}] + -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}] + -2 FLOAT min P-value for baseQ bias [$opts{2}] + -3 FLOAT min P-value for mapQ bias [$opts{3}] + -4 FLOAT min P-value for end distance bias [$opts{4}] + -e FLOAT min P-value for HWE (plus F<0) [$opts{e}] -p print filtered variants + +Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. \n/) if (@ARGV == 0 && -t STDIN); # calculate the window size - my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W}); + my ($ol, $ow) = ($opts{W}, $opts{w}); my $max_dist = $ol > $ow? $ol : $ow; - $max_dist = $oW if ($max_dist < $oW); # the core loop - my @staging; # (indel_filtering_score, flt_tag) + my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...) while (<>) { my @t = split; - next if (/^#/); + if (/^#/) { + print; next; + } next if ($t[4] eq '.'); # skip non-var sites - my $is_snp = 1; + next if ($t[3] eq 'N'); # skip sites with unknown ref ('N') + # check if the site is a SNP + my $type = 1; # SNP if (length($t[3]) > 1) { - $is_snp = 0; + $type = 2; # MNP + my @s = split(',', $t[4]); + for (@s) { + $type = 3 if (length != length($t[3])); + } } else { my @s = split(',', $t[4]); for (@s) { - $is_snp = 0 if (length > 1); + $type = 3 if (length > 1); } } # clear the out-of-range elements @@ -245,24 +268,18 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]); varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much } - my ($flt, $score) = (0, -1); - - # collect key annotations - my ($dp, $mq, $af) = (-1, -1, 1); - if ($t[7] =~ /DP=(\d+)/i) { - $dp = $1; - } elsif ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) { + my $flt = 0; + # parse annotations + my ($dp, $mq, $dp_alt) = (-1, -1, -1); + if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) { $dp = $1 + $2 + $3 + $4; + $dp_alt = $3 + $4; } - if ($t[7] =~ /MQ=(\d+)/i) { - $mq = $1; - } - if ($t[7] =~ /AF=([^\s;=]+)/i) { - $af = $1; - } elsif ($t[7] =~ /AF1=([^\s;=]+)/i) { - $af = $1; + if ($t[7] =~ /DP=(\d+)/i) { + $dp = $1; } - # the depth filter + $mq = $1 if ($t[7] =~ /MQ=(\d+)/i); + # the depth and mapQ filter if ($dp >= 0) { if ($dp < $opts{d}) { $flt = 2; @@ -270,58 +287,57 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] $flt = 3; } } + $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a}); + $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); + $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ + && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); + $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S}))); + # HWE filter + if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) { + my $p = 2*$1 + $2; + my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0; + $flt = 9 if ($f < 0); + } - # site dependent filters - my $dlen = 0; + my $score = $t[5] * 100 + $dp_alt; + my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs if ($flt == 0) { - if (!$is_snp) { # an indel - # If deletion, remember the length of the deletion - $dlen = length($t[3]) - 1; - $flt = 1 if ($mq < $opts{q}); - # filtering SNPs - if ($t[5] >= $opts{G}) { - for my $x (@staging) { - # Is it a SNP and is it outside the SNP filter window? - next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]); - $x->[1] = 5 if ($x->[1] == 0); - } + if ($type == 3) { # an indel + # filtering SNPs and MNPs + for my $x (@staging) { + next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]); + $x->[1] = 5; } - # the indel filtering score - $score = $t[5]; # check the staging list for indel filtering for my $x (@staging) { - # Is it a SNP and is it outside the gap filter window - next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]); - if ($x->[0] < $score) { + next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]); + if ($x->[0]>>2 < $score) { $x->[1] = 6; } else { $flt = 6; last; } } - } else { # a SNP - $flt = 1 if ($mq < $opts{Q}); - # check adjacent SNPs - my $k = 1; + } else { # SNP or MNP for my $x (@staging) { - ++$k if ($x->[0] < 0 && -($x->[0] + 1) > $opts{F} && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5)); + next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]); + if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1) + && length($x->[7]) - length($x->[6]) == 1) { + $x->[1] = 5; + } else { $flt = 5; } + last; } - # filtering is necessary - if ($k > $opts{N}) { - $flt = 4; - for my $x (@staging) { - $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0); - } - } else { # then check gap filter - for my $x (@staging) { - next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]); - if ($x->[0] >= $opts{G}) { - $flt = 5; last; - } + # check MNP + for my $x (@staging) { + next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]); + if ($x->[0]>>2 < $score) { + $x->[1] = 8; + } else { + $flt = 8; last; } } } } - push(@staging, [$score < 0? -$af-1 : $score, $flt, $dlen, @t]); + push(@staging, [$score<<2|$type, $flt, $rlen, @t]); } # output the last few elements in the staging list while (@staging) { @@ -334,47 +350,35 @@ sub varFilter_aux { if ($first->[1] == 0) { print join("\t", @$first[3 .. @$first-1]), "\n"; } elsif ($is_print) { - print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; + print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; } } -sub filter4vcf { - my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3); - getopts('d:D:1:2:3:4:Q:q:', \%opts); - die(qq/ -Usage: vcfutils.pl filter4vcf [options] - -Options: -d INT min total depth (given DP or DP4) [$opts{d}] - -D INT max total depth [$opts{D}] - -q INT min SNP quality [$opts{q}] - -Q INT min RMS mapQ (given MQ) [$opts{Q}] - -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}] - -2 FLOAT min P-value for baseQ bias [$opts{2}] - -3 FLOAT min P-value for mapQ bias [$opts{3}] - -4 FLOAT min P-value for end distance bias [$opts{4}]\n -/) if (@ARGV == 0 && -t STDIN); - - my %ts = (AG=>1, GA=>1, CT=>1, TC=>1); - - my @n = (0, 0); +sub gapstats { + my (@c0, @c1); + $c0[$_] = $c1[$_] = 0 for (0 .. 10000); while (<>) { - if (/^#/) { - print; - next; - } - next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); - my $depth = -1; - $depth = $1 if (/DP=(\d+)/); - $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/); - next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D})); - next if (/MQ=(\d+)/ && $1 < $opts{Q}); + next if (/^#/); my @t = split; - next if ($t[5] >= 0 && $t[5] < $opts{q}); - ++$n[0]; + next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel my @s = split(',', $t[4]); - ++$n[1] if ($ts{$t[3].$s[0]}); - print; + for my $x (@s) { + my $l = length($x) - length($t[3]) + 5000; + if ($x =~ /^-/) { + $l = -(length($x) - 1) + 5000; + } elsif ($x =~ /^\+/) { + $l = length($x) - 1 + 5000; + } + $c0[$l] += 1 / @s; + } + } + for (my $i = 0; $i < 10000; ++$i) { + next if ($c0[$i] == 0); + $c1[0] += $c0[$i]; + $c1[1] += $c0[$i] if (($i-5000)%3 == 0); + printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]); } + printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]); } sub ucscsnp2vcf { @@ -462,6 +466,87 @@ sub hapmap2vcf { } } +sub vcf2fq { + my %opts = (d=>3, D=>100000, Q=>10, l=>5); + getopts('d:D:Q:l:', \%opts); + die(qq/ +Usage: vcfutils.pl vcf2fq [options] + +Options: -d INT minimum depth [$opts{d}] + -D INT maximum depth [$opts{D}] + -Q INT min RMS mapQ [$opts{Q}] + -l INT INDEL filtering window [$opts{l}] +\n/) if (@ARGV == 0 && -t STDIN); + + my ($last_chr, $seq, $qual, $last_pos, @gaps); + my $_Q = $opts{Q}; + my $_d = $opts{d}; + my $_D = $opts{D}; + + my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y', + GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K'); + + $last_chr = ''; + while (<>) { + next if (/^#/); + my @t = split; + if ($last_chr ne $t[0]) { + &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); + ($last_chr, $last_pos) = ($t[0], 0); + $seq = $qual = ''; + @gaps = (); + } + die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0); + if ($t[1] - $last_pos > 1) { + $seq .= 'n' x ($t[1] - $last_pos - 1); + $qual .= '!' x ($t[1] - $last_pos - 1); + } + if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference + my ($ref, $alt) = ($t[3], $1); + my ($b, $q); + $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/); + if ($q < 0) { + $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0; + $b = ($_ < .5 || $alt eq '.')? $ref : $alt; + $q = -$q; + } else { + $b = $het{"$ref$alt"}; + $b ||= 'N'; + } + $b = lc($b); + $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D)); + $q = int($q + 33 + .499); + $q = chr($q <= 126? $q : 126); + $seq .= $b; + $qual .= $q; + } elsif ($t[4] ne '.') { # an INDEL + push(@gaps, [$t[1], length($t[3])]); + } + $last_pos = $t[1]; + } + &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); +} + +sub v2q_post_process { + my ($chr, $seq, $qual, $gaps, $l) = @_; + for my $g (@$gaps) { + my $beg = $g->[0] > $l? $g->[0] - $l : 0; + my $end = $g->[0] + $g->[1] + $l; + $end = length($$seq) if ($end > length($$seq)); + substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg)); + } + print "\@$chr\n"; &v2q_print_str($seq); + print "+\n"; &v2q_print_str($qual); +} + +sub v2q_print_str { + my ($s) = @_; + my $l = length($$s); + for (my $i = 0; $i < $l; $i += 60) { + print substr($$s, $i, 60), "\n"; + } +} + sub usage { die(qq/ Usage: vcfutils.pl []\n @@ -469,9 +554,14 @@ Command: subsam get a subset of samples listsam list the samples fillac fill the allele count field qstats SNP stats stratified by QUAL - varFilter filtering short variants - filter4vcf filtering VCFs produced by samtools+bcftools + hapmap2vcf convert the hapmap format to VCF ucscsnp2vcf convert UCSC SNP SQL dump to VCF + + varFilter filtering short variants (*) + vcf2fq VCF->fastq (**) + +Notes: Commands with description endting with (*) may need bcftools + specific annotations. \n/); }