X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bcftools%2Fvcfutils.pl;h=2b7ba0b1d00932be1f79f08e4b3bd8e8db951295;hp=6cc168fb363f59541511429b6e6538e0263acbe1;hb=2347ad62090e036841008e054c234868e21f7a80;hpb=6a0c6f060a60789b48e10a72b1381f6e54599302 diff --git a/bcftools/vcfutils.pl b/bcftools/vcfutils.pl index 6cc168f..2b7ba0b 100755 --- a/bcftools/vcfutils.pl +++ b/bcftools/vcfutils.pl @@ -14,11 +14,27 @@ sub main { my $command = shift(@ARGV); my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats, - gapstats=>\&gapstats); + gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; } +sub splitchr { + my %opts = (l=>5000000); + getopts('l:', \%opts); + my $l = $opts{l}; + die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] \n/) if (@ARGV == 0 && -t STDIN); + while (<>) { + my @t = split; + my $last = 0; + for (my $i = 0; $i < $t[1];) { + my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l; + print "$t[0]:".($i+1)."-$e\n"; + $i = $e; + } + } +} + sub subsam { die(qq/Usage: vcfutils.pl subsam [samples]\n/) if (@ARGV == 0); my ($fh, %h); @@ -70,7 +86,7 @@ sub fillac { print; } else { my @t = split; - my @c = (0); + my @c = (0, 0); my $n = 0; my $s = -1; @_ = split(":", $t[8]); @@ -199,8 +215,8 @@ Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions # } sub varFilter { - my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4); - getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts); + my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4); + getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts); die(qq/ Usage: vcfutils.pl varFilter [options] @@ -214,6 +230,7 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] -2 FLOAT min P-value for baseQ bias [$opts{2}] -3 FLOAT min P-value for mapQ bias [$opts{3}] -4 FLOAT min P-value for end distance bias [$opts{4}] + -e FLOAT min P-value for HWE (plus F<0) [$opts{e}] -p print filtered variants Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. @@ -230,6 +247,7 @@ Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. print; next; } next if ($t[4] eq '.'); # skip non-var sites + next if ($t[3] eq 'N'); # skip sites with unknown ref ('N') # check if the site is a SNP my $type = 1; # SNP if (length($t[3]) > 1) { @@ -273,6 +291,13 @@ Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); + $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S}))); + # HWE filter + if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) { + my $p = 2*$1 + $2; + my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0; + $flt = 9 if ($f < 0); + } my $score = $t[5] * 100 + $dp_alt; my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs @@ -295,7 +320,10 @@ Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. } else { # SNP or MNP for my $x (@staging) { next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]); - $flt = 5; + if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1) + && length($x->[7]) - length($x->[6]) == 1) { + $x->[1] = 5; + } else { $flt = 5; } last; } # check MNP @@ -322,7 +350,7 @@ sub varFilter_aux { if ($first->[1] == 0) { print join("\t", @$first[3 .. @$first-1]), "\n"; } elsif ($is_print) { - print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; + print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; } } @@ -438,6 +466,87 @@ sub hapmap2vcf { } } +sub vcf2fq { + my %opts = (d=>3, D=>100000, Q=>10, l=>5); + getopts('d:D:Q:l:', \%opts); + die(qq/ +Usage: vcfutils.pl vcf2fq [options] + +Options: -d INT minimum depth [$opts{d}] + -D INT maximum depth [$opts{D}] + -Q INT min RMS mapQ [$opts{Q}] + -l INT INDEL filtering window [$opts{l}] +\n/) if (@ARGV == 0 && -t STDIN); + + my ($last_chr, $seq, $qual, $last_pos, @gaps); + my $_Q = $opts{Q}; + my $_d = $opts{d}; + my $_D = $opts{D}; + + my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y', + GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K'); + + $last_chr = ''; + while (<>) { + next if (/^#/); + my @t = split; + if ($last_chr ne $t[0]) { + &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); + ($last_chr, $last_pos) = ($t[0], 0); + $seq = $qual = ''; + @gaps = (); + } + die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0); + if ($t[1] - $last_pos > 1) { + $seq .= 'n' x ($t[1] - $last_pos - 1); + $qual .= '!' x ($t[1] - $last_pos - 1); + } + if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference + my ($ref, $alt) = ($t[3], $1); + my ($b, $q); + $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/); + if ($q < 0) { + $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0; + $b = ($_ < .5 || $alt eq '.')? $ref : $alt; + $q = -$q; + } else { + $b = $het{"$ref$alt"}; + $b ||= 'N'; + } + $b = lc($b); + $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D)); + $q = int($q + 33 + .499); + $q = chr($q <= 126? $q : 126); + $seq .= $b; + $qual .= $q; + } elsif ($t[4] ne '.') { # an INDEL + push(@gaps, [$t[1], length($t[3])]); + } + $last_pos = $t[1]; + } + &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); +} + +sub v2q_post_process { + my ($chr, $seq, $qual, $gaps, $l) = @_; + for my $g (@$gaps) { + my $beg = $g->[0] > $l? $g->[0] - $l : 0; + my $end = $g->[0] + $g->[1] + $l; + $end = length($$seq) if ($end > length($$seq)); + substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg)); + } + print "\@$chr\n"; &v2q_print_str($seq); + print "+\n"; &v2q_print_str($qual); +} + +sub v2q_print_str { + my ($s) = @_; + my $l = length($$s); + for (my $i = 0; $i < $l; $i += 60) { + print substr($$s, $i, 60), "\n"; + } +} + sub usage { die(qq/ Usage: vcfutils.pl []\n @@ -445,8 +554,14 @@ Command: subsam get a subset of samples listsam list the samples fillac fill the allele count field qstats SNP stats stratified by QUAL - varFilter filtering short variants + hapmap2vcf convert the hapmap format to VCF ucscsnp2vcf convert UCSC SNP SQL dump to VCF + + varFilter filtering short variants (*) + vcf2fq VCF->fastq (**) + +Notes: Commands with description endting with (*) may need bcftools + specific annotations. \n/); }