Converting alignment files

From BioPerl
Jump to: navigation, search

This script will convert an alignment file to another alignment file format. I use it to convert Mauve xmfa output to a directory of fasta alignments, but it is useful for converting fasta to clw or other formats.

Sorry if someone already wrote this kind of script... I couldn't find one like it right away, so I made it and thought I should contribute. --Lskatz 18:24, 21 February 2012 (UTC)

I updated my script. There are two major things in this that I think I updated over time which is error reporting and also the ability to concatenate an entire xmfa into a fasta alignment. Some people in my lab wanted to view a MAUVE alignment in MEGA, which prompted this. --Lskatz 20:10, 3 August 2012 (UTC)

#!/usr/bin/env perl
 
# Converts an alignment to another alignment format
# Run with no arguments or with -h for help.
# author: Lee Katz <lkatz@cdc.gov>
 
use Bio::AlignIO;
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use List::Util qw/max/;
 
sub logmsg{print "@_\n";};
exit(main());
 
sub main{
  my $settings={};
  die usage() if(@ARGV<1);
  GetOptions($settings,qw(infile=s ginformat=s outfile=s format=s concatenateAlignment linker=s));
  die usage() if($$settings{help});
 
  my $infile=$$settings{infile} or die "Error: Need infile param:\n".usage();
  my $outfile=$$settings{outfile} or die "Error: Need outfile param:\n".usage();
  $$settings{outfileformat}=$$settings{format} || "fasta";
  $$settings{linker}||="";
 
  convertAln($infile,$outfile,$settings);
 
  logmsg "Output is now in $outfile";
  return 0;
}
 
sub convertAln{
  my($infile,$outfile,$settings)=@_;
  $$settings{infileFormat}=$$settings{ginformat}||guess_format($infile,$settings);
  my $numAlns;
  if($outfile=~/\/$/){
    $numAlns=convertAlnToDir($infile,$outfile,$settings);
  } else {
    $numAlns=convertAlnToFile($infile,$outfile,$settings);
  }
  logmsg "Finished converting $infile to $outfile ($numAlns alignments)";
  return $numAlns;
}
 
sub convertAlnToDir{
  my($infile,$outdir,$settings)=@_;
  $outdir=~s/\/+$//; # remove the trailing slash
  logmsg "Converting $infile ($$settings{infileFormat}) to a directory $outdir ($$settings{outfileformat})";
  mkdir($outdir) if(!-d $outdir);
  my $in=Bio::AlignIO->new(-file=>$infile,-format=>$$settings{infileFormat});
  my $alnCounter=0;
  my $outExtension=$$settings{outfileformat};
  while(my $aln=$in->next_aln){
    my $out=Bio::AlignIO->new(-file=>">$outdir/".++$alnCounter.$outExtension,-format=>$$settings{outfileformat},-flush=>0);
    $out->write_aln($aln);
  }
  return $alnCounter;
}
 
sub convertAlnToFile{
  my($infile,$outfile,$settings)=@_;
  logmsg "Converting $infile ($$settings{infileFormat}) to file $outfile";
  my $in=Bio::AlignIO->new(-file=>$infile,-format=>$$settings{infileFormat});
  my $out=Bio::AlignIO->new(-file=>">$outfile",-format=>$$settings{outfileformat},-flush=>0);
  my $alnCounter=0;
  if($$settings{concatenateAlignment}){
    my @expectedId=findAllUniqueIdsInAln($infile,$settings);
    my %alnSequence;
    while(my $aln=$in->next_aln){
      # concatenate sequences
      foreach my $seq($aln->each_seq){
        $alnSequence{$seq->id}.=$seq->seq.$$settings{linker};
      }
 
      # keep the alignment flush, even if I have to add gaps
      my $currentAlnLength=max(map(length($_),values(%alnSequence)));
      for(@expectedId){
        my $lackOfSequence=$currentAlnLength-length($alnSequence{$_});
        $alnSequence{$_}.='-'x$lackOfSequence;
      }
    }
 
    # write the complete alignment to a file
    my $concatAln=Bio::SimpleAlign->new();
    for(@expectedId){
      my $stringfh;
      open($stringfh, "<", \$alnSequence{$_}) or die "Could not open string for reading: $!";
      my $seq=new Bio::LocatableSeq(-seq=>$alnSequence{$_},-id=>$_); # must be locatableSeq to be added
      $concatAln->add_seq($seq);
    }
    $out->write_aln($concatAln);
  } else { # it's pretty simple to just do a conversion to a different file
    while(my $aln=$in->next_aln){
      $out->write_aln($aln);
      $alnCounter++;
    }
  }
  return $alnCounter;
}
 
sub findAllUniqueIdsInAln{
  my($infile,$settings)=@_;
  logmsg "Finding all expected sequence IDs";
  my %genome;
  my $alnin=Bio::AlignIO->new(-file=>$infile,-format=>$$settings{infileFormat});
  while(my $aln=$alnin->next_aln){
    foreach my $seq($aln->each_seq){
      $genome{$seq->id}++;
    }
  }
  return keys(%genome);
}
 
sub guess_format{
  my ($filename,$settings)=@_;
  for my $format (qw(pfam xmfa selex fasta stockholm prodom clustalw msf mase bl2seq nexus phylip)){
    eval{
      my $in=Bio::AlignIO->new(-file=>$filename,-format=>$format);
      my $aln=$in->next_aln;
      $aln->length;
    };
    if($@){
      my $error=$@;
      next;
    }
    return $format;
  }
  die "Could not guess the format of $filename\n";
}
 
sub usage{
  "Converts an alignment to another alignment format.
  $0 -i alignmentfile -o outputPrefix [-f outputformat]
  -i the alignment file to input
  -o the output alignment file or directory
    Specify a directory by a trailing slash.
    If a directory, the output format will be determined by -f. Default: fasta.
  -f output format
    possible values are derived from bioperl.
    e.g. fasta, clustalw, phylip, xmfa
  -g input format to force the correct input format, but it's ok to let it guess
  -c to concatenate the alignment into one solid entry
  -l linkerSequence, to be used with -c, to separate entries.
    Default is no linker but a useful linker could be NNNNNNNNNN
  -h this helpful menu
  ";
}
Personal tools
Namespaces
Variants
Actions
Main Links
documentation
community
development
Toolbox