#!/usr/bin/perl -w # FASTA_MODULE # VERSION: Version 2 (1 Dec 2003) # PURPOSE: Reads single or multiple FastA files # # Read_FastA_sequences (path) # Opens file # Reads one or more contiguous sequences in FastA format, along with header # Saves header and (upper case) sequence # Used in conjunction with Get_sequence_info # Closes file # Returns number of sequences read # # Get_sequence_info (number of sequence) # Returns the header and sequence of the n'th sequence stored # by Read_FastA_sequence # Returns undefined if sequence doesn't exist, perhaps because # Read_FastA_sequence hasn't been successfully invoked # # Read_FastA_sequence (path) # Opens file # Reads first (or only) header and sequence in FastA format # Closes file # Returns header and (upper case) sequence # Read_next_FastA_sequence (path) # Opens file if not already open # Reads next unread FastA sequence # Returns header and (upper case) sequence # More space-efficient than Read_FastA_sequences package FastA_module; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(Read_FastA_sequences Get_sequence_info Read_FastA_sequence Read_next_FastA_sequence); ############## LIBRARIES AND PRAGMAS ################ use strict; #################### CONSTANTS ###################### my $true = 1; my $LF = "\n"; #################### VARIABLES ###################### my @seq_info; # Stores all headers and sequences: # 2D array: $seq_info[n,0] = nth header # $seq_info[n,1] = nth sequence my %file_handle; # File handles for open files my %next_header; # Keeps track of next FastA header for each open file ################# INITIALIZATION #################### return $true; #################### SUBROUTINES #################### ##### READ_FASTA_SEQUENCES (path) # Reads file consisting of one or more FastA-formatted sequences # Saves an array, @seq_info, containing header and sequence # Returns number of sequences read sub Read_FastA_sequences { my ($file_path) = @_; open SEQUENCES, "<$file_path" or die "Can't open $file_path: $!\n"; my $header; # Current header my $sequence; # Current sequence my $line = ; while (defined $line) { chomp $line; if (substr($line,0,1) eq ">") { # Found header if (defined $header) { # If not first sequence push @seq_info, [$header, $sequence]; # Save current sequence } $header = substr($line,1); # Save header of next sequence $sequence = ""; # Initialize next sequence } else { $sequence .= uc($line); # Append line to sequence } $line = ; } if (defined $header) { # If current sequence exists push @seq_info, [$header, $sequence]; # Save it } close SEQUENCES; return scalar(@seq_info); # Return number of sequences read } ##### GET_SEQUENCE_INFO (n) # Returns header and sequence of nth sequence in @seq_info sub Get_sequence_info { my ($n) = @_; my $header = ${$seq_info[$n-1]}[0]; my $sequence = ${$seq_info[$n-1]}[1]; return $header, $sequence; } #### READ_FASTA_SEQUENCE (path to file) # Opens sequence file in FastA format # Returns header and upper cased sequence sub Read_FastA_sequence { my ($path) = @_; my $line; my $header; my $sequence = ""; open SEQUENCE_FILE, "<$path" or die "Can't open $path: $!\n"; $line = ; # Read first line (must be header) chomp $line; # Remove line break $header = substr($line, 1); # Saves header, discards ">" $line = ; # Read next line while (defined $line) { # While file still has lines... chomp $line; # Remove line break $line =~ s/\r//; # And carriage return (if any) $sequence .= $line; # Append the line to the sequence $line = ; # Read next line } close SEQUENCE_FILE; return ($header, uc($sequence)); } ##### READ_NEXT_FASTA_SEQUENCE (path to file) # Opens file consisting of one or more FastA-formatted sequences # Each successive call reads and returns the next sequence # Returns header and upper cased sequence sub Read_next_FastA_sequence { my ($path) = @_; my $line; my $header; my $sequence = ""; if (not defined $file_handle{$path} ) { # new file $file_handle{$path} = Unique_filehandle($path); $line = readline( $file_handle{$path} ); while (defined $line) { chomp $line; if (substr($line, 0, 1) eq ">") {last} $line = readline( $file_handle{$path} ); } $next_header{$path} = $line; } $header = $next_header{$path}; $next_header{$path} = undef; $line = readline( $file_handle{$path} ); while (defined $line) { chomp $line; # Remove line feed $line =~ s/\r//; # And carriage return (if any) if (substr($line,0,1) eq ">") { # Found header $next_header{$path} = $line; last; } else { # Found part of sequence $sequence .= uc($line); # Append line to sequence } $line = readline( $file_handle{$path} ); # Read next line } return ($header, $sequence); } ##### UNIQUE_FILEHANDLE # Returns anonymous filehandle sub Unique_filehandle { my ($path) = @_; local *FH; open FH, "<$path" or die "Can't open $path: $!\n"; return *FH; }