use strict; use warnings; use Getopt::Long; use Bio::SeqIO; ### use perl /home/xiangyang/Chen_genome/perl_bin/extract_protein_dir.pl /home/xiangyang/Chen_genome/chen_gbk my $path=$ARGV[0]; # my $filename=getcwd $path; opendir DIR, $path or die $!; my @dir = readdir DIR; closedir DIR; foreach my $file(@dir){ my $input="$path/$file"; mkdir "$path/protein/"; my $output="$path/protein/$file.pep"; open(OUTPUT, ">$output"); ####################################################################################### my %options = ( # input => undef, # output => undef, cdsCount => 0, dna => 'F' ); GetOptions( 'i=s' => \$options{'input'}, 'dna=s' => \$options{'dna'} ); ####################################################################################### my $in = new Bio::SeqIO( -file => $input, -format => 'genbank' ); my $out = new Bio::SeqIO( -file => $output , -format => 'fasta' ); while ( my $seqObject = $in->next_seq() ) { my @features = $seqObject->get_SeqFeatures(); #--------------------------------------------------------------------------------------------display organism name for each CDS my $org = ""; foreach (@features) { my $feat = $_; my @ar; if ($feat->primary_tag() eq "source"){ @ar = $feat->get_tag_values('organism'); $org = $ar[0]; # $org =~ s/\ /_/g; addtion "_" among each word # print STDERR "Found source tag: >$display<\n"; next; } #---------------------------------------------------------------------------------------------xiangyang li addition 2013-01-25 my $length = $seqObject->length(); my $type = lc( $feat->primary_tag ); unless ( $type eq "cds") { next; } my $start = $feat->start; my $stop = $feat->end; my $location = $feat->location; my $locString = $location->to_FTstring; my @loc = split( /,/, $locString ); if ( $loc[0] =~ m/(\d+)\.\.(\d+)/ ) { $start = $1; } if ( $loc[ scalar(@loc) - 1 ] =~ m/(\d+)\.\.(\d+)/ ) { $stop = $2; } my @label = (); if ( $feat->has_tag('locus_tag') ) { push( @label, join( ",", $feat->get_tag_values('locus_tag') ) ); } if ( $feat->has_tag('gene') ) { # push (@label, join(",",$feat->get_tag_values('gene'))); } if ( $feat->has_tag('product') ) { push (@label, join(",",$feat->get_tag_values('product'))); } #------------------------------------------------------------------------------------- push (@label, join(",","[".$file."]")); #-------------------------------------------------------------------------------------xiangyang li addition 2013-01-25 if ( $feat->has_tag('function') ) { # push (@label, join(",",$feat->get_tag_values('function'))); } #add position information to label #label_start=4001;end=5000;strand=1;rf=1; #where strand is 1 or -1 and rf is 1,2, or 3 #start should be smaller than the end # push (@label, "_start=$start;end=$stop" ); my $label = join( ";", @label ); $label =~ s/\s+/_/g; $label =~ s/\n//g; $label =~ s/\t+/ /g; my $trans; if ( !( $feat->has_tag('translation') ) ) { print( "Warning: get_cds.pl was unable to obtain translation for $label. Skipping\n" ); next; } else { my @translation = $feat->get_tag_values('translation'); $trans = $translation[0]; $trans =~ s/[^A-Z]//ig; } # my $dna = $feat->spliced_seq->seq; # if ( ( !( defined($dna) ) ) && ( $options{'dna'} =~ m/t/i ) ) { # print( # "Warning: get_cds.pl was unable to obtain the DNA coding sequence for $label. Skipping\n" # ); # next; # } # $dna =~ s/[^A-Z]//ig; $options{cdsCount}++; # if ( $options{'dna'} =~ m/t/i ) { # print( OUTPUT "$label\n$dna\n\n" ); # } # else { print OUTPUT ">$label\n$trans\n"; # } # close(OUTPUT) or die("Cannot close file : $!"); } } print "A total of $options{cdsCount} records were written to $file.\n"; }