Counting k-mers in large sets of large sequences

From BioPerl
Jump to: navigation, search

(This is the fruit of two scraps, Getting all k-mer combinations of residues and Sharing large arrays among multiple threads)

Marco Blanchette contributes the following scripts. Script 1 (countKmer.pl) processes a (possibly very) large sequence file using multiple threads, to return the frequency distribution of k-mers of arbitrary length k. Script 2 (sampleAndCountKmer.pl) does the same thing, if desired, or will randomly sample the sequences (with or without replacement) to return an empirical frequency distribution of k-mers. The user can also specify whether to count just presence/absence of a k-mer in a sequence, or to return a complete frequency distribution.

Script 1 (countKmer.pl)

[back to top]


 #!/usr/local/bin/perl
 # countKmer.pl take a series of fasta files and count all the occurrence of all the
 # possible kmers in each files. It's a multi-threaded scripts that treat multiple files 
 # asynchronously
 
 use strict;
 use warnings;
 use Getopt::Std;
 use threads;
 use Thread::Queue;
 
 our ($opt_k,$opt_m,$opt_p);
 our $SEP="\t";
 
 MAIN:{
   init();
 
   my $q = Thread::Queue->new;
   $q->enqueue(@ARGV);
 
   my $num_workers = @ARGV < $opt_p ? @ARGV : $opt_p; # no need to be wasteful :)
 
   for (1 .. $num_workers) {
     threads->new(\&worker, $q);
   }
 
   $_->join for threads->list;
 }
 
 sub worker {
   my $queue = shift;
   while (my $filename = $queue->dequeue_nb) {
     processFile($filename);
   }
   return(0);
 }
 
 sub processFile {
   my $file = shift;
 
   #generate all possible word for a k-mer (define at run time by $opt_k)
   print STDERR "Generating all the posible sequences $opt_k long\n";
   my $kmer_p = kmer_generator($opt_k);
 
   print STDERR "Counting the number of $opt_k nt long kmers in $file\n";
   loadSeq($file,$kmer_p);
 
   ##print out the hits
   printHits($kmer_p,$file);
 }
 
 sub loadSeq {
   my $file = shift;
   my $kmer_p = shift;
 
   open FH, "$file" || die "Can't open file $file\n";
   my $f=0;
   my $seq;
   while(<FH>){
     if(/^>/){
       countKmer(\$seq,$kmer_p) if $seq;
       $seq='';
       $f=1;
     }elsif($f==1){
       chomp;
       next if "";
       $seq = join("",$seq,$_);
     }
   }
   countKmer(\$seq,$kmer_p) if $seq; # do not forget the last sequence
   close FH;
   return(0);
 }
 
 sub countKmer {
   my $seq_p = shift;
   my $kmer_p = shift;
   my $k = $opt_k;
   my %beenThere;
 
   for (my $i=0;$i <= length(${$seq_p})-$k;$i++){
     my $w = substr(${$seq_p},$i,$k);
     unless ($opt_m){
       #Count only one occurrence of a kmer per sequence
       $kmer_p->{$w}++ if !exists $beenThere{$w} && exists $kmer_p->{$w};
       $beenThere{$w}=1;
     }else{
       #Count all instances of a kmer per sequence
       $kmer_p->{$w}++ if exists $kmer_p->{$w};
     }
   }
   return(0);
 }
 
 sub printHits {
   my $kmer_p=shift;
   my $file = shift;
 
   ##print out the hits
   my ($dir,$pre,$suf) = ($file =~ /(^.+\/|^)(.+)\.(.+$)/);
   open OUT, ">$pre.hits" || die "Can't create file $pre.hits\n";
   print OUT join($SEP, $_, $kmer_p->{$_}), "\n" for sort keys %{$kmer_p};
   close OUT;
 
   return(0);
 }
 
 
 sub kmer_generator {
   my $k = shift;
   my $kmer_p;
 
   my @bases = ('A','C','G','T');
   my @words = @bases;
 
   for (my $i=1;$i<$k;$i++){
     my @newwords;
     foreach my $w (@words){
       foreach my $b (@bases){
 	push (@newwords,$w.$b);
       }
     }
     undef @words;
     @words = @newwords;
   }
   map{ $kmer_p->{$_}=0} @words;
   return($kmer_p);
 }
 
 sub init {
   getopts("p:k:m");
   unless (@ARGV){
     print("\nUsage: countKmer.pl [-k 6 -p 4 -m] sequence_1.fa [sequence_2.fa ...]\n",
 	  "\tFor each possible words in the kmer of lenght -k,\n",
 	  "\tcount the number of time they are found in the fasta sequence file\n",
 	  "\t-k\tsize of the kmer to analyze. Default 6\n",
 	  "\t-m\twill count all possible kmer per sequences.\n",
 	  "\t\tDefault: only one kmer is counted per sequence entries\n",
 	  "\t-p\tThe number of jobs to process simultaneoulsy. Normally, the number of available processors\n",
 	  "\t\tDefault: 4\n",
 	  "\n",
 	  );
     exit(1)
   }
   $opt_k=6 unless $opt_k;
   $opt_p=4 unless $opt_p;
   return(0);
 }

Script 2 (sampleAndCountKmer.pl)

[back to top]


 #!/usr/local/bin/perl
 # sampleAndCountKmer.pl is a bootstrapping procedure that sample a list of sequences a 
 # certain number of time and return the count of each kmer in each sampling. Again this 
 # is a mutli-threaded scripts that will count multiple samples asynchronously
 
 use strict;
 use warnings;
 use Getopt::Std;
 use threads;
 use threads::shared;
 use Thread::Queue;
 
 our ($opt_f,$opt_k,$opt_m,$opt_p,$opt_s,$opt_n,$opt_w,$opt_v);
 our $SEP="\t";
 
 MAIN:{
   init();
   my $file = $opt_f;
   my $seq_p = loadSeq($opt_f);
 
   #create the final result hash with shared arrays at each kmers.
   my $res = &share({});
   $res = kmer_generator($opt_k);
   $res->{$_} = &share([]) for keys %{$res};
 
   my $q = Thread::Queue->new;
   $q->enqueue(1..$opt_n);
 
   my $num_workers = $opt_p; # no need to be wasteful :)
 
   for (1 .. $num_workers) {
     threads->new(\&worker, $q, $seq_p, $res, $_);
   }
 
   for (threads->list){
     $_->join;
   }
   ##Now, need to deconvolute the arrays containing all the counts for all kmers
   ##and spit out a table of count for all the samples
 
   print(join ("$SEP",$_,@{$res->{$_}}),"\n") for keys %$res;
   print STDERR "all done!\n" if $opt_v;
 }
 
 sub worker {
   my $queue = shift;
   my $seq_p = shift;
   my $res = shift;
   my $t_id = shift;
 
   while (my $q_id = $queue->dequeue_nb) {
     processSeq($q_id,$seq_p,$res);
   }
   return(0);
 }
 
 sub processSeq {
   my $q_id = shift;
   my $seq_p = shift;
   my $res = shift;
 
   #generate all possible word for a k-mer (define at run time by $opt_k)
   print STDERR "Generating all the posible sequences $opt_k nucleotid long kmer for sample $q_id\n" if $opt_v;
   my $kmer_p = kmer_generator($opt_k);
 
   #randomly sample the sequences
   my $sample = sampleSeq($seq_p,$q_id);
 
   #counting the number of kmers in the sample seq
   print STDERR "Counting the number of $opt_k nt long kmers in the $q_id sample\n" if $opt_v;
   map{  countKmer($_,$kmer_p) } @{$sample};
 
   #put the hits in the final result kmer;
   map { push @{$res->{$_}}, $kmer_p->{$_} } keys %{$res};
   return(0);
 }
 
 sub sampleSeq {
   my $seq_p = shift;
   my $q_id = shift;
   my $sample_size = $opt_s;
 
   #create a list of $sample_size items of randomly selected position in @seqs 
   my @lines;
   unless ($opt_w){
     ##Sampling with replacement
     while (scalar(@lines) < $sample_size){
       push @lines, int(rand(@{$seq_p}));
     }
   }else{
     ##Sampling without replacement
     my %lines;
     while (scalar(keys %lines) < $sample_size){
       $lines{int(rand(@{$seq_p}))}='';
     }
     @lines = keys %lines;
   }
 
   print STDERR "Sampling $sample_size items for the sample $q_id\n" if $opt_v;
   my @sample =  map { $seq_p->[$_] } @lines;
 
   return(\@sample);
 }
 
 sub loadSeq {
   my $file = shift;
   my $seq_p;
 
   open FH, "$file" || die "Can't open file $file\n";
   my $f=0;
   my $seq;
 
   while(<FH>){
     if(/^>/){
       push @{$seq_p}, $seq if $seq;
       $seq='';
       $f=1;
     }elsif($f==1){
       chomp;
       next if "";
       $seq = join("",$seq,$_);
     }
   }
   close FH;
  return($seq_p);
 }
 
 sub countKmer {
   my $seq = shift;
   my $kmer_p = shift;
   my $k = $opt_k;
 
   my %beenThere;
   for (my $i=0;$i <= length($seq)-$k;$i++){
     my $w = substr($seq,$i,$k);
     unless ($opt_m){
       #Count only one occurrence of the kmer per sequence
       $kmer_p->{$w}++ if !exists $beenThere{$w} && exists $kmer_p->{$w};
       $beenThere{$w}=1;
     }else{
       #Count all instances of each kmers in a sequence
       $kmer_p->{$w}++ if exists $kmer_p->{$w};
     }
   }
   return(0);
 }
 
 sub printHits {
   my $kmer_p=shift;
   my $file = shift;
 
   ##print out the hits
   my ($dir,$pre,$suf) = ($file =~ /(^.+\/|^)(.+)\.(.+$)/);
   open OUT, ">$pre.hits" || die "Can't create file $pre.hits\n";
   print OUT join($SEP, $_, $kmer_p->{$_}), "\n" for keys %{$kmer_p};
   close OUT;
 
   return(0);
 }
 
 
 sub kmer_generator {
   my $k = shift;
   my $q_id=shift;
   my $kmer_p;
 
   my @bases = ('A','C','G','T');
   my @words = @bases;
 
   for (my $i=1;$i<$k;$i++){
     my @newwords;
     foreach my $w (@words){
       foreach my $b (@bases){
 	push (@newwords,$w.$b);
       }
     }
     undef @words;
     @words = @newwords;
   }
   map{ $kmer_p->{$_}=0} @words;
   return($kmer_p);
 }
 
 sub init {
   getopts("f:p:k:s:n:mwv");
   unless ($opt_f){
    print("\nUsage:\tsampleAndCountKmer.pl [-k 6 -p 4 -s 100000 -n 1000 -m -w] -f sequence.fa \n",
	  "\twill return a table of counts for each k-mers of length -k\n",
	  "\t-f\tFasta database. Required\n",
	  "\t-k\tsize of the k-mer to analyze. Default 6\n",
	  "\t-m\twill count all possible k-mer per sequence.\n",
	  "\t\tDefault: only one k-mer is counted per sequence entries\n",
	  "\t-s number of sequences to sample. Default: 100,000 sequences\n",
	  "\t-n number of times to repeat the sampling/counting procedures. Default: 1000 times\n",
	  "\t-w Sample without replacment: Default: with replacement\n",
	  "\t-p\tThe number of jobs to process simultaneoulsy. Normally, the number of available processors\n",
	  "\t\tDefault: 4\n",
	  "\t-v Run in verbrose mode. Spits out a lot of infomation. Default: quiet\n",
	  "\n",
	  );
     exit(1)
   }
   $opt_k=6 unless $opt_k;
   $opt_p=4 unless $opt_p;
   $opt_s=100000 unless $opt_s;
   $opt_n=1000 unless $opt_n;
   return(0);
 }
[back to top]


Personal tools
Namespaces
Variants
Actions
Main Links
documentation
community
development
Toolbox