Converting alignment files

From BioPerl
Jump to: navigation, search

This script will convert an alignment file to another alignment file format. I use it to convert Mauve xmfa output to a directory of fasta alignments, but it is useful for converting fasta to clw or other formats.

Sorry if someone already wrote this kind of script... I couldn't find one like it right away, so I made it and thought I should contribute. --Lskatz 18:24, 21 February 2012 (UTC)

I updated my script. There are two major things in this that I think I updated over time which is error reporting and also the ability to concatenate an entire xmfa into a fasta alignment. Some people in my lab wanted to view a MAUVE alignment in MEGA, which prompted this. --Lskatz 20:10, 3 August 2012 (UTC)

#!/usr/bin/env perl
# Converts an alignment to another alignment format
# Run with no arguments or with -h for help.
# author: Lee Katz <>
use Bio::AlignIO;
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use List::Util qw/max/;
sub logmsg{print "@_\n";};
sub main{
  my $settings={};
  die usage() if(@ARGV<1);
  GetOptions($settings,qw(infile=s ginformat=s outfile=s format=s concatenateAlignment linker=s));
  die usage() if($$settings{help});
  my $infile=$$settings{infile} or die "Error: Need infile param:\n".usage();
  my $outfile=$$settings{outfile} or die "Error: Need outfile param:\n".usage();
  $$settings{outfileformat}=$$settings{format} || "fasta";
  logmsg "Output is now in $outfile";
  return 0;
sub convertAln{
  my $numAlns;
  } else {
  logmsg "Finished converting $infile to $outfile ($numAlns alignments)";
  return $numAlns;
sub convertAlnToDir{
  $outdir=~s/\/+$//; # remove the trailing slash
  logmsg "Converting $infile ($$settings{infileFormat}) to a directory $outdir ($$settings{outfileformat})";
  mkdir($outdir) if(!-d $outdir);
  my $in=Bio::AlignIO->new(-file=>$infile,-format=>$$settings{infileFormat});
  my $alnCounter=0;
  my $outExtension=$$settings{outfileformat};
  while(my $aln=$in->next_aln){
    my $out=Bio::AlignIO->new(-file=>">$outdir/".++$alnCounter.$outExtension,-format=>$$settings{outfileformat});
  return $alnCounter;
sub convertAlnToFile{
  logmsg "Converting $infile ($$settings{infileFormat}) to file $outfile";
  my $in=Bio::AlignIO->new(-file=>$infile,-format=>$$settings{infileFormat});
  my $out=Bio::AlignIO->new(-file=>">$outfile",-format=>$$settings{outfileformat});
  my $alnCounter=0;
    my @expectedId=findAllUniqueIdsInAln($infile,$settings);
    my %alnSequence;
    while(my $aln=$in->next_aln){
      # concatenate sequences
      foreach my $seq($aln->each_seq){
      # keep the alignment flush, even if I have to add gaps
      my $currentAlnLength=max(map(length($_),values(%alnSequence)));
        my $lackOfSequence=$currentAlnLength-length($alnSequence{$_});
    # write the complete alignment to a file
    my $concatAln=Bio::SimpleAlign->new();
      my $stringfh;
      open($stringfh, "<", \$alnSequence{$_}) or die "Could not open string for reading: $!";
      my $seq=new Bio::LocatableSeq(-seq=>$alnSequence{$_},-id=>$_); # must be locatableSeq to be added
  } else { # it's pretty simple to just do a conversion to a different file
    while(my $aln=$in->next_aln){
  return $alnCounter;
sub findAllUniqueIdsInAln{
  logmsg "Finding all expected sequence IDs";
  my %genome;
  my $alnin=Bio::AlignIO->new(-file=>$infile,-format=>$$settings{infileFormat});
  while(my $aln=$alnin->next_aln){
    foreach my $seq($aln->each_seq){
  return keys(%genome);
sub guess_format{
  my ($filename,$settings)=@_;
  for my $format (qw(pfam xmfa selex fasta stockholm prodom clustalw msf mase bl2seq nexus phylip)){
      my $in=Bio::AlignIO->new(-file=>$filename,-format=>$format);
      my $aln=$in->next_aln;
      my $error=$@;
    return $format;
  die "Could not guess the format of $filename\n";
sub usage{
  "Converts an alignment to another alignment format.
  $0 -i alignmentfile -o outputPrefix [-f outputformat]
  -i the alignment file to input
  -o the output alignment file or directory
    Specify a directory by a trailing slash.
    If a directory, the output format will be determined by -f. Default: fasta.
  -f output format
    possible values are derived from bioperl.
    e.g. fasta, clustalw, phylip, xmfa
  -g input format to force the correct input format, but it's ok to let it guess
  -c to concatenate the alignment into one solid entry
  -l linkerSequence, to be used with -c, to separate entries.
    Default is no linker but a useful linker could be NNNNNNNNNN
  -h this helpful menu
Personal tools
Main Links