package PTyper; use strict; use warnings; use threads; use Bio::SeqIO; use File::Basename qw; use FindBin; use lib "$FindBin::Bin/lib"; #################################################### #################################################### sub batch_genomenucleotide_extract { my ($path, $fasta1, $fasta2, $thread_number)=@_; opendir DIR, $path or die $!; ################################start rename the file in following case my @path_temp = readdir DIR; foreach (@path_temp){ if (/[ =]/) { my $new_name = $_; $new_name =~ s/ /_/g; #repalce the space with "_" for all GenBank files $new_name =~ s/=/_/g; #repalce "=" with "_" for all GenBank files $new_name =~ s/_+/_/g; system ("mv '$path/$_' '$path/$new_name'"); } } closedir DIR; #################################end rename the file in following case opendir DIR, $path or die $!; my @dir = readdir DIR; @dir = grep ($_!~/^\./, @dir); closedir DIR; my @input; my @outfile1; my @outfile2; foreach my $file(@dir){ my $input="$path/$file"; push (@input,$input); my $output1="$fasta1/$file"; push (@outfile1,$output1); my $output2="$fasta2/$file"; push (@outfile2,$output2); } my $sub_file_number = scalar @input; my $thread; my @threads; my $job_number=0; while(){ last if ($job_number eq $sub_file_number); while(scalar(threads->list())<$thread_number) { #set thread number $job_number++; my $input_file = $input[$job_number-1]; my $output_file1 = $outfile1[$job_number-1]; my $output_file2 = $outfile2[$job_number-1]; #print "Genbank_extraction: $job_number\n"; $threads[$job_number-1]=threads->new(\&genomenucleotide_extract, $input_file, $output_file1, $output_file2); last if ($job_number eq $sub_file_number); } foreach $thread(threads->list(threads::all)){ if($thread->is_joinable()) { $thread->join(); } } } foreach my $th (threads->list(threads::all)) { $th->join(); } } #################################################### #################################################### sub genomenucleotide_extract { my ($input, $output1, $output2)=@_; open (TEMP_IN, $input); my @temp_in = ; my $first_line = shift @temp_in; my $po_seq = join ("", @temp_in); open(OUTPUT_1, ">$output1"); open(OUTPUT_2, ">$output2"); my $filename = basename $input; if ($first_line =~ /^>/){ #input is fasta format my %hash_seq = parse_fasta($input); my $jdna; foreach (keys %hash_seq){ print OUTPUT_1 ">$_\n", print_sequence_into_file($hash_seq{$_}, 60), "\n"; $jdna .=$hash_seq{$_}; } print OUTPUT_2 ">$filename\n", print_sequence_into_file($jdna, 60); }else{ #input is genbank format my $DDD; my $in = new Bio::SeqIO( -file => $input, -format => 'genbank' ); #my $out = new Bio::SeqIO( -file => $output1, -format => 'fasta' ); while (my $seqObject = $in->next_seq()){ my $acc = $seqObject->accession; my $desc = $seqObject->desc; my $dna = $seqObject->seq(); my $id = $seqObject->display_id(); print OUTPUT_1 ">$id\n", print_sequence_into_file($dna, 60), "\n"; $DDD .=$dna; } $DDD=print_sequence_into_file($DDD, 60); # print_sequence_into_file($dna, 70); # 60 characters per line print OUTPUT_2 ">$filename\n$DDD"; } close OUTPUT_1; close OUTPUT_2; } # subroutine print_sequence_into_file # print $sequence into $FILE with $length character per line sub print_sequence_into_file{ my($sequence, $length) = @_; my $DNA; my $string; # Print sequence in lines of $length for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ){ $DNA=substr($sequence, $pos, $length)."\n"; $string.=$DNA; } return $string; } ################################################################################################## ###### Subrounting--do bacth work ###### Function: ###### do bacth work ################################################################################################## sub batch_genenucleotide_TFT_extract { my ($path_genbank, $path_TFT, $path_gene, $thread_number, $prodigal_annotation, $path_fa)=@_; opendir PATH_GENBANK, $path_genbank or die "could not open $path_genbank"; my @path_genbank = readdir PATH_GENBANK; @path_genbank =grep ($_!~/^\./ ,@path_genbank); #delete hidden file . .. closedir PATH_GENBANK; my $sub_file_number = scalar @path_genbank; my @input; my @outfile_2; my @outfile_3; foreach my $file_genbank(@path_genbank){ $file_genbank =~ s/ /_/g; my $input = "$path_genbank/$file_genbank"; push (@input,$input); my $output_2 = "$path_TFT/$file_genbank.tbl"; push (@outfile_2,$output_2); my $output_3="$path_gene/$file_genbank.fasta"; push (@outfile_3,$output_3); } my $thread; my @threads; my $job_number=0; print " GenBank_extraction_percent: "; while(){ last if ($job_number eq $sub_file_number); while(scalar(threads->list())<$thread_number) { #set thread number my $progress_record = int (($job_number/$sub_file_number)*100); $job_number++; my $input_file = $input[$job_number-1]; my $output_file_2 = $outfile_2[$job_number-1]; my $output_file_3 = $outfile_3[$job_number-1]; my $GenBank_extraction_progress = int (($job_number/$sub_file_number)*100); print "$GenBank_extraction_progress%","..." if ($job_number == 1 or ( ($GenBank_extraction_progress%10 ==0) && ($progress_record <$GenBank_extraction_progress)) ); $threads[$job_number-1]=threads->new(\&genenucleotide_TFT_extract, $input_file, $output_file_2, $output_file_3, $prodigal_annotation, $path_fa); last if ($job_number eq $sub_file_number); } foreach $thread(threads->list(threads::all)){ if($thread->is_joinable()) { $thread->join(); } } } foreach my $th (threads->list(threads::all)) { $th->join(); } print "done\n"; } ################################################################################################## ###### Subrounting--genbank_sequence_TFT_extract ###### Function: ###### extract gene sequence and tbl (cds/rRNA/tRNA information) from genbank file ################################################################################################## sub genenucleotide_TFT_extract { my ($input, $output_2, $output_3, $prodigal_annotation, $path_fa)=@_; my $file_name = basename($input); open(OUTPUT_2, ">$output_2"); open(OUTPUT_3, ">$output_3"); my $in = new Bio::SeqIO( -file => $input, -format => 'genbank' ); while ( my $seqObject = $in->next_seq() ) { my $acc = $seqObject->accession; my @features = $seqObject->get_SeqFeatures(); my $count = 0; foreach (@features) { my $feat = $_; unless ($feat->primary_tag =~ /^CDS|^rRNA|^tRNA|^source/) { next; } my $primary; ($primary) = $feat->primary_tag; if ($primary eq "source") { print OUTPUT_2 ">Contig: $acc\n"; } else { $count++; #print "genome_$count\n"; my ($start, $stop); my $location = $feat->location; my $locString = $location->to_FTstring; if ($locString =~ /,/) { if ($locString =~ /complement/) { my @loc = split( /,/, $locString); if ($loc[0] =~ /(\d+)\.\.(\d+)/){ my $length_part = $2-$1+1; $loc[1] =~ /(\d+)\.\.(\d+)/; $start = $2; $stop = $1-$length_part; } else{ $loc[1] =~ /(\d+)\.\.(\d+)/; $start = $2; $stop = $1-1; } } else { my @loc = split( /,/, $locString ); if ($loc[0] =~ /(\d+)\.\.(\d+)/) { my $length_part = $2-$1+1; $loc[1] =~ /(\d+)\.\.(\d+)/; $stop = $2; $start = $1-$length_part; } else{ $loc[1] =~ /(\d+)\.\.(\d+)/; $stop = $2; $start = $1-1; } } } else { if ($locString =~ /complement$(.+)\.\.(.+)$/) { $start = $2; $stop = $1; } else { $locString =~ /(.+)\.\.(.+)/; $start = $1; $stop = $2; } } my $gene_name; if ( $feat->has_tag('gene') ) { ($gene_name) = $feat->get_tag_values('gene'); } my $locus_tag; if ( $feat->has_tag('locus_tag') ) { ($locus_tag) = $feat->get_tag_values('locus_tag'); }else {$locus_tag = "CDS_$acc"."_".$count;} my $product; if ( $feat->has_tag('product') ) { ($product) = $feat->get_tag_values('product'); $product =~ s/ /_/g; $product =~ s/;/_/g; $product =~ s/,/_/g; }else { $product = "No_product_tag"; } my $pseudo; if ( $feat->has_tag('pseudo') ) { $pseudo = "pseudo"; if (defined $gene_name){ print OUTPUT_2 "$start\t$stop\t$primary\t$gene_name;$locus_tag\t$product\t$acc\t$pseudo\n"; } else { print OUTPUT_2 "$start\t$stop\t$primary\t$locus_tag\t$product\t$acc\t$pseudo\n"; } } else { if (defined $gene_name){ print OUTPUT_2 "$start\t$stop\t$primary\t$gene_name;$locus_tag\t$product\t$acc\n"; } else { print OUTPUT_2 "$start\t$stop\t$primary\t$locus_tag\t$product\t$acc\n"; } } my $dna; if ( $feat->spliced_seq->seq ) { $dna = $feat->spliced_seq->seq; print OUTPUT_3 ">$locus_tag#$product;$file_name\n$dna\n"; } } } } close OUTPUT_2; close OUTPUT_3; my @feature = stat ($output_3); my $size = $feature[7]; my $sec_input = "$path_fa/$file_name"; &prodigal_tool($sec_input, $output_3, $output_2, $prodigal_annotation) if $size == 0; } ######################################### # bacth annnotate genome using prodigal ######################################### sub prodigal_bacth { my ($input_dir, $protein_dir, $gene_dir, $tft_dir, $thread_number, $prodigal_annotation) = @_; opendir (DIR_SEQ, $input_dir) or die "could not open $input_dir"; my @input_dir = readdir DIR_SEQ; @input_dir = grep ($_!~/^\./ ,@input_dir); closedir DIR_SEQ; my @input; my @outprotein; my @outgene; my @outtft; foreach(sort @input_dir){ push @input, "$input_dir/$_"; push @outprotein, "$protein_dir/$_"; push @outgene, "$gene_dir/$_"; push @outtft, "$tft_dir/$_"; } my $sub_file_number = scalar @input; my $thread; my @threads; my $job_number=0; print " Annotating genome using prodigal: "; while(){ last if ($job_number eq $sub_file_number); while(scalar(threads->list())<$thread_number) { #set threadnumber； my $progress_record = int (($job_number/$sub_file_number)*100); $job_number++; my $input_file = $input[$job_number-1]; my $output_protein = $outprotein[$job_number-1]; my $output_gene = $outgene[$job_number-1]; my $output_tft = $outtft[$job_number-1]; my $genome_annotation_percent = int (($job_number/$sub_file_number)*100); print "$genome_annotation_percent%","..." if ($job_number == 1 or ( ($genome_annotation_percent%10 ==0) && ($progress_record <$genome_annotation_percent)) ); $threads[$job_number-1]=threads->new(\&prodigal_tool, $input_file, $output_protein, $output_gene, $output_tft, $prodigal_annotation); last if ($job_number eq $sub_file_number); } foreach $thread(threads->list(threads::all)){ if($thread->is_joinable()) { $thread->join(); } } } foreach my $th (threads->list(threads::all)) { $th->join(); } print "done\n"; } ######################################### # gene and gff file obtain using prodigal tool ######################################### sub prodigal_tool { my ($input, $outprotein, $outgene, $outtft, $prodigal_annotation) =@_; $outprotein =~ s/.fasta$// if $prodigal_annotation eq "F"; $outgene =~ s/.fasta$// if $prodigal_annotation eq "F"; $outtft =~ s/.tbl$//; my $prodigal = `which prodigal`; # "/usr/bin/blastp"; $prodigal =~ s/\n//g; system ("$prodigal -i $input -f gbk -g 11 -q -p meta -a $outprotein -d $outgene -o $outtft"); &gene_file($outprotein); &gene_file($outgene); &tbl_file($outtft); } ######################################### # gene file conversion ######################################### sub gene_file { my ($out) = @_; #$out =~ s/.fasta$//; my $genome_name = basename($out); my $id = substr($genome_name, 0, 15); my $count=0; my $term = $/; # input record separator; my %seq_hash; open (FAS, $out); $/ = ">"; # input record separator; open (OUT, ">$out.fasta"); while(){ chomp; next if($_ eq ''); $count++; my ($fid, @sequencelines) = split /\n/; foreach my $line (@sequencelines) { $seq_hash{$fid} .= $line; } my $gene = $id."_".$count; print OUT ">$gene#XXX;$genome_name\n", $seq_hash{$fid}, "\n"; } $/ = $term; close FAS; close OUT; system("rm -rf $out"); } # end of gene_file ######################################### # gff file was transfomed to tbl file ######################################### sub tbl_file { my ($gff) = @_; my $genome_name = basename($gff); my $id = substr($genome_name, 0, 15); my $count =0; my %contig; open (IN, $gff); open (OUT, ">$gff.tbl"); while(){ chomp; if ($_ =~ /seqhdr/){ my $contig = $_; $contig =~ s/.*?"//; $contig =~ s/".*//; print OUT ">Contig: $contig\n"; $contig{$genome_name}=$contig; } if ($_ =~ /^ CDS/){ $count++; my $cds = $_; $cds =~ s/^ CDS.*?(\S)/$1/;#complement(571..1734) $cds =~ s/$|$//g; my ($s,$e); if ($cds =~ /complement/){ $cds =~ s/complement//; $cds =~ /(.*?)\.\.(.*)/; $s= $2; $e= $1; }else{ $cds =~ /(.*?)\.\.(.*)/; $s= $1; $e= $2; } my $gene = $id."_".$count; print OUT "$s\t$e\tCDS\t$gene\tXXX\t$contig{$genome_name}\n"; } } close IN; close OUT; system("rm -rf $gff"); } # end of gene_file sub parse_fasta { my ($infile) = @_; my $term = $/; # input record separator; my %seq_hash = (); # key:seqid, val:seq open my $INFILE, $infile or die "could not open infile '$infile' : $! \n"; $/ = ">"; # input record separator; while(<$INFILE>) { chomp; next if($_ eq ''); my ($id, @sequencelines) = split /\n/; foreach my $line (@sequencelines) { $seq_hash{$id} .= $line; } } $/ = $term; return(%seq_hash); } # end of parse_fasta ########################################################### ########################################################### sub bacth_blast_best { my ($work_number, $thread_number, $subfunction, $sub_parameters) = @_; my ($input_file, $db_file, $output_file, $e_value, $blastn) = @$sub_parameters; my @input = @$input_file; my @outfile = @$output_file; my $thread; my @threads; my $job_number=0; print " Blastn_percent: "; while(){ last if ($job_number>=$work_number); while(scalar(threads->list())<$thread_number) { #set threadnumber； my $progress_record = int (($job_number/$work_number)*100); $job_number++; my $input_file = $input[$job_number-1]; my $output_file = $outfile[$job_number-1]; my $Blast_perform_progress = int (($job_number/$work_number)*100); print "$Blast_perform_progress%","..." if ($job_number == 1 or ( ($Blast_perform_progress%10 ==0) && ($progress_record <$Blast_perform_progress)) ); $threads[$job_number-1]=threads->new(\&$subfunction, $input_file, $db_file, $output_file, $e_value, $blastn); last if ($job_number>=$work_number); } foreach $thread(threads->list(threads::all)){ if($thread->is_joinable()) { $thread->join(); } } } foreach my $th (threads->list(threads::all)) { $th->join(); } print "done\n"; } #****** Warining: this subrouine is only used to find the best hit for each query gene against database, and a loose cutoff for evalue sub blastn_best { my ($input, $db, $outfile, $e, $b) = @_; my $cmd = "$b -outfmt '6 qseqid sseqid pident qlen slen length qstart qend sstart send evalue bitscore' -max_target_seqs 2 -evalue $e -num_threads 1 -query $input -db $db -out $outfile"; system("$cmd 2>/dev/null"); #BLASTn -max_target_seqs -num_alignments } ######################################################### ######################################################### sub result_statistics { my ($workplace, $tbl, $mapping_result) = @_; my $result_statistics_workplace = "$workplace/result_statistics"; mkdir $result_statistics_workplace; my $tbl_folder = extract_tbl($result_statistics_workplace, $tbl, $mapping_result); # obtain part_tbl folder my $output_statistics = "$result_statistics_workplace/Statistics_OUT"; mkdir $output_statistics; #print "$result_statistics_workplace\n$tbl_folder\n$output_statistics\n"; my $out1 = "$output_statistics/Gene_number_statistics"; my $out2 = "$output_statistics/CPS_number_statistics"; my $out3 = "$output_statistics/Gene_LocusTag_statistics"; my $out4 = "$output_statistics/CPS_LocusTag_statistics"; my $out_classs_gene = "$output_statistics/classification_gene"; my $out_class = "$output_statistics/classification_CPS"; open (OUT_1, ">$out1") or die "could not open infile $out1\n"; open (OUT_2, ">$out2") or die "could not open infile $out2\n"; open (OUT_3, ">$out3") or die "could not open infile $out3\n"; open (OUT_4, ">$out4") or die "could not open infile $out4\n"; my $temp_total_tbl = "$result_statistics_workplace/temp_total_tbl"; chdir $tbl_folder; #system ("cat * > $temp_total_tbl"); system ("find . ! -name '.' | xargs cat > $temp_total_tbl"); # all files not be .; For large number of files, directly using cat may causing error ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ my %gene_hash; my %class_hash; open (TEMP_TBL, $temp_total_tbl); my @temp_tbl = ; close TEMP_TBL; foreach (@temp_tbl){ chomp; my @array = split "\t", $_; my $new_keys = "$array[1]|$array[0]"; # class|gene $gene_hash{$new_keys} = $array[1]; $class_hash{$array[1]} = $array[0]; #print "$array[0]\t$array[1]\n"; } ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ print OUT_1 "Gene_type\t"; print OUT_3 "Gene_type\t"; my $type_check =""; foreach (sort keys %gene_hash) { $_ =~ s/\|.*//; $type_check = $_; ################################################################################### # if ($type_check ne $_){ # $type_check = $_; print OUT_1 "$type_check\t"; # print class gene name print OUT_3 "$type_check\t"; # print class gene name # }else{ # print OUT_1 "\t"; # print class gene name # print OUT_3 "\t"; # print class gene name # } ################################################################################### } print OUT_1 "\nStrain\t"; print OUT_2 "Strain\t"; print OUT_3 "\nStrain\t"; print OUT_4 "Strain\t"; foreach (sort keys %gene_hash) { $_ =~ s/(.*)\|//; print OUT_1 "$_\t"; # print class gene name print OUT_3 "$_\t"; # print class gene name } print OUT_1 "\n"; print OUT_3 "\n"; foreach (sort keys %class_hash) { print OUT_2 "$_\t"; # print class name print OUT_4 "$_\t"; # print class name } print OUT_2 "\n"; print OUT_4 "Serotype/number/frequency\n"; # added by xiangyang 2022-02-13 ###------------------------------------------------------------------------------------------------------------------------- opendir (TBL_FOLDER, $tbl_folder); my @total_protein = readdir TBL_FOLDER; @total_protein =grep ($_!~/^\./ ,@total_protein); #delete hidden file . .. @total_protein =grep ($_!~/\~$/ ,@total_protein); #delete temp file ~ closedir TBL_FOLDER; my (@array_gene_num, @array_class_num); foreach my $eachfile (sort @total_protein) { open (EACHFILE, "$tbl_folder/$eachfile") or die "could not open infile $eachfile\n"; print OUT_1 "$eachfile\t"; print OUT_2 "$eachfile\t"; print OUT_3 "$eachfile\t"; print OUT_4 "$eachfile\t"; my (@arr_temp_each1, @arr_temp_each2); my (%hash_LocusTag1, %hash_LocusTag2); while() { chomp; my @sequencelines = split /\t/; push (@arr_temp_each1, $sequencelines[0]); #class_gene (gene) push (@arr_temp_each2, $sequencelines[1]); #class (class) $hash_LocusTag1{$sequencelines[5]} = $sequencelines[0]; #keys: LocusTag; values: class_gene (gene) $hash_LocusTag2{$sequencelines[5]."|".$sequencelines[0]."|".$sequencelines[4]} = $sequencelines[1]; #keys: LocusTag; values: class (class) #added by xiangyang 2022-02-14 } close EACHFILE; #print join (": ", @arr_temp_each1); #print "\n"; my @array_num_temp1; foreach my $keygene (sort keys %gene_hash) { #print "$keygene\n"; $keygene =~ s/(.*)\|//; my $macth_num1 = grep ($keygene eq $_, @arr_temp_each1); print OUT_1 "$macth_num1\t"; push @array_num_temp1, $macth_num1; # recode gene number temply; my @macth_num1 = grep ($keygene eq $_, @arr_temp_each1); my @macth_LocusTag1; foreach my $key_LocusTag (keys %hash_LocusTag1) { push(@macth_LocusTag1, $key_LocusTag) if grep ($hash_LocusTag1{$key_LocusTag} eq $_, @macth_num1); } print OUT_3 join (": ", @macth_LocusTag1), "\t"; } print OUT_1 "\n"; print OUT_3 "\n"; @array_num_temp1 = sort{$b<=>$a}@array_num_temp1; push @array_gene_num, $array_num_temp1[0]; my @serotype; # added by xiangyang 2022-02-13 my @array_num_temp2; foreach my $keyclass (sort keys %class_hash) { my $macth_num2 = grep ($keyclass eq $_, @arr_temp_each2); print OUT_2 "$macth_num2\t"; push @array_num_temp2, $macth_num2; # recode class number temply; my @macth_num2 = grep ($keyclass eq $_, @arr_temp_each2); my @macth_LocusTag2; foreach my $key_LocusTag (keys %hash_LocusTag2) { push(@macth_LocusTag2, $key_LocusTag) if grep ($hash_LocusTag2{$key_LocusTag} eq $_, @macth_num2); my $c_nu = $key_LocusTag; # added by xiangyang 2022-02-13 $c_nu =~ m/(.*?)\|(.*?)\|(.*?)/; # added by xiangyang 2022-02-13 my $cc_nu = $2; # added by xiangyang 2022-02-13 $cc_nu =~ s/.*\-//g; # added by xiangyang 2022-02-13 push @serotype, $cc_nu if grep ($hash_LocusTag2{$key_LocusTag} eq $_, @macth_num2);# added by xiangyang 2022-02-13 } print OUT_4 join (": ", @macth_LocusTag2), "\t"; } print OUT_2 "\n"; ########################################################## # added by xiangyang 2022-02-13 my @f_arr; foreach my $c_count (@serotype){ push @f_arr, (split /#/, $c_count) if $c_count =~ /#/; push @f_arr, $c_count if $c_count !~ /#/; } my %c_hash; foreach (@f_arr){ $c_hash{$_}++; } my $key_max; my @temp_max = sort {$a<=>$b} values %c_hash; my $c_max= $temp_max[-1]; for my $c_key (keys %c_hash) { if ($c_hash{$c_key} == $c_max){ $key_max .= "$c_key|"; #multipe values have the same keys for max } } $key_max =~ s/\|$//; #delete "|" at the end of the string my $number = scalar @serotype; my $occur_pro = $c_max/(scalar @serotype)*100; # added by xiangyang 2022-02-13 $occur_pro = sprintf "%.2f",$occur_pro; # added by xiangyang 2022-02-13 ########################################################### print OUT_4 "Serotype $key_max/$number/$occur_pro","%\n"; # added by xiangyang 2022-02-13 @array_num_temp2 = sort{$b<=>$a}@array_num_temp2; push @array_class_num, $array_num_temp2[0]; } @array_gene_num = sort{$b<=>$a}@array_gene_num; my $max_gene_number = $array_gene_num[0]; @array_class_num = sort{$b<=>$a}@array_class_num; my $max_class_number = $array_class_num[0]; #print "max_gene_number: $max_gene_number\n"; #print "max_class_number: $max_class_number\n"; close OUT_1; close OUT_2; close OUT_3; close OUT_4; system("rm $temp_total_tbl"); my $tbl_heatmap_dir_gene = tbl_heatmap_generation($out1, $result_statistics_workplace, "tbl_heatmap_gene", $out_classs_gene, $max_gene_number); my $tbl_heatmap_dir_class = tbl_heatmap_generation($out2, $result_statistics_workplace, "tbl_heatmap_class", $out_class, $max_class_number); ###------------------------------------------------------------------------------------------------------------------------- } ######################################################### ######################################################### sub extract_tbl { my ($home_directory, $tbl_folder, $mapping_result) = @_; my $out_protein_folder = "$home_directory/tbl_part"; mkdir $out_protein_folder; #system ("rm $out_protein_folder/*"); my $temp_list_split_dir = split_list_tbl ($mapping_result, $home_directory); opendir (TBL_FOLDER, $tbl_folder); my @total_protein = readdir TBL_FOLDER; @total_protein =grep ($_!~/^\./ ,@total_protein); #delete hidden file . .. closedir TBL_FOLDER; foreach my $eachfile (@total_protein) { my %seq_hash; open (EACHFILE, "$tbl_folder/$eachfile") or die "could not open infile $eachfile\n"; while() { chomp; next if($_ eq ''); my @sequencelines = split /\t/ if $_ !~ /^>Contig/; $sequencelines[3] =~ s/.*;//g if $_ !~ /^>Contig/; $seq_hash{$sequencelines[3]} = $_ if $_ !~ /^>Contig/; #key: locus tag; value: all feature in one row in tbl file } close EACHFILE; my $matck_file = $eachfile; # added by xiangyang 2022-02-15 $matck_file =~ s/\.tbl.*part/\.tbl/g; # added by xiangyang 2022-02-15 if (-e "$temp_list_split_dir/$matck_file") { # replace $eachfile with $matck_file added by xiangyang 2022-02-15 open (LIST_TP, "$temp_list_split_dir/$matck_file"); # replace $eachfile with $matck_file added by xiangyang 2022-02-15 while (){ chomp; my @array=split ("\t", $_); $array[0] =~ s/#/;/g; my @array_2 = split ";", $array[0]; ########################################################################################## my $fhs; my $source = $array_2[2]; #print $source, "\n"; unless( $fhs->{$source} ) { #当hash键(第一列)不存在时，创建字符串句柄 open my $fh, ">>$out_protein_folder/$source" or die '...'; $fhs->{$source} = $fh; } my $identity = $seq_hash{$array_2[0]} if defined $seq_hash{$array_2[0]}; # added by xiangyang 2022-02-13 my $iden_cov = $array[8]."_".$array[9]."-".$array[10]; # added identity and qcoverage-scoverage for blastn from CPS_cluster_mapping.result file by xiangyang 2022-02-13 $identity =~ s/\tCDS\t/\t\Q$iden_cov\E\t/ if defined $seq_hash{$array_2[0]}; # added by xiangyang 2022-02-13 $identity =~ s/\\//g; #$identity = sprintf "%.2f",$identity; print { $fhs->{$source} } "$array[2]\t$array[4]\t$identity\n" if defined $seq_hash{$array_2[0]}; ########################################################################################## } close LIST_TP; } } system("rm -rf $temp_list_split_dir"); return $out_protein_folder; } ######################################################### ######################################################### sub split_list_tbl { my ($list, $home_directory) = @_; my $temp_list_split_dir = "$home_directory/temp_tbl_split_dir"; mkdir $temp_list_split_dir; open (LIST_PARSE, "$list"); while (){ chomp; $_ =~ s/\t(.*)//g; my $right_content = $1; ########################################################################################## my $fhs; my $source = $_; $source =~ s/.*;//g; unless( $fhs->{$source} ) { #当hash键(第一列)不存在时，创建字符串句柄 open my $fh, ">>$temp_list_split_dir/$source.tbl" or die '...'; $fhs->{$source} = $fh; } print { $fhs->{$source} } "$_\t$right_content\n"; ########################################################################################## } close LIST_PARSE; return $temp_list_split_dir; } ######################################################### ######################################################### sub tbl_heatmap_generation { my ($list, $home_directory, $directory_name, $out_classification, $max_number) = @_; my $tbl_heatmap_dir = "$home_directory/$directory_name"; mkdir $tbl_heatmap_dir; open (LIST_PARSE, "$list"); my @list_parse = ; my @Class_name = split "\t", shift @list_parse; open (OUT_CLASSIFICATION, ">$out_classification") or die "could not open infile $out_classification\n"; print OUT_CLASSIFICATION "$max_number\n"; ####### caculate the number of gene number belonging to specific class my %hash_number; #print "number: ", scalar @Class_name, "\n"; foreach my $element_class (@Class_name){ if ($element_class !~ /^\s|^Strain/){ $hash_number{$element_class} = grep ($element_class eq $_, @Class_name); } } my %count; my @Class_name_new = grep { ++$count{ $_ } < 2; } @Class_name; foreach (@Class_name_new){ print OUT_CLASSIFICATION "Class_type\t$_\t", $hash_number{$_}, "\n" if $_ !~ /^\s|^Strain|^Gene_type/; } #######caculate the number of gene number belonging to specific class my @gene_name = split "\t", shift @list_parse if $list =~ /Gene_number_statistics/; if ($list =~ /Gene_number_statistics/){ foreach my $element_gene (@gene_name){ if ($element_gene !~ /^\s|^Strain/){ $hash_number{$element_gene} = grep ($element_gene eq $_, @gene_name); } } my $index_gene=0; foreach (@gene_name){ print OUT_CLASSIFICATION "Gene_type\t$_\t", $hash_number{$_}, "\n" if $_ !~ /^\s|^Strain/; $index_gene++; } } close OUT_CLASSIFICATION; foreach (@list_parse){ chomp; my @array=split ("\t", $_); ########################################################################################## my $fhs; my $source = shift @array; unless( $fhs->{$source} ) { #当hash键(第一列)不存在时，创建字符串句柄 open my $fh, ">>$tbl_heatmap_dir/$source.tbl" or die '...'; $fhs->{$source} = $fh; } my $length_count=0; foreach(@array){ my $start = 1+50*$length_count; my $end = 50*($length_count+1); $length_count++; print { $fhs->{$source} } "$start\t$end\tCDS\t$gene_name[$length_count]\t$_\n" if $list =~ /Gene_number_statistics/; print { $fhs->{$source} } "$start\t$end\tCDS\t$Class_name[$length_count]\t$_\n" if $list !~ /Gene_number_statistics/; } ########################################################################################## } close LIST_PARSE; return $tbl_heatmap_dir; } ######################################################### ######################################################### sub mapping { my ($mappingFile, $filter_blast_cluster, $cluster_mapping)= @_; my %mapping; open (MAPPING_OUT, ">$cluster_mapping"); open (MAPPING, $mappingFile); my $cout_test=0; while (my $line = ) { chomp($line); $line =~ s/\r//g; ## BacMet-Scan Version 2 $cout_test++; my ($GI_number, $GenBank_ID, $Gene_name, $Organism, $Compound, $NCBI_annotation) = split("\t",$line); $mapping{$GI_number} = "$Gene_name\t$Organism\t$Compound\t$NCBI_annotation"; $mapping{$GenBank_ID} = "$Gene_name\t$Organism\t$Compound\t$NCBI_annotation"; } close MAPPING; open (ALL_ALL, $filter_blast_cluster); while (my $ele = ) { chomp($ele); next unless $ele !~ /Qseqid/; #??? my @arr_temp = split('\t',$ele); print MAPPING_OUT "$arr_temp[0]\t$arr_temp[1]\t$mapping{$arr_temp[1]}\t$arr_temp[2]\t$arr_temp[3]\t$arr_temp[4]\t$arr_temp[5]\t$arr_temp[6]\t$arr_temp[7]\n"; } close ALL_ALL; close MAPPING_OUT; return $cluster_mapping; } ######################################################### ######################################################### sub SARG_map_transformation { my ($database, $list, $map_table) = @_; my %hash_id; open (SEQ, $database); while (){ chomp; if (/^>(.+)/) { my $id = $1; $id =~ s/ /\t/; $id =~ s/\[/\t\[/ if $id =~ /\[/; #print "SEQ: $id\n"; my @arr_id = split "\t", $id; $hash_id{$arr_id[0]} = "$arr_id[1]\t$arr_id[2]" if $id =~ /\[/; $hash_id{$arr_id[0]} = "$arr_id[1]\t[Organism_unavailable]" if $id !~ /\[/; #print "$count\t$arr_id[0]\t$hash_id{$arr_id[0]}\n"; } } close SEQ; open (IN, $list); open (OUT, ">$map_table"); print OUT "GI_number\tCategories_in_database\tGene_name\tOrganism\tCompound\tNCBI_annotation\n"; my %text_hash; while (){ chomp $_; next unless $_ =~ /\_\_/; $_ =~ s/'//g; $_ =~ s/, /,/g; $_ =~ s/[\[\]]//g; my @array = split "\t", $_; my @array_id = split /,/, $array[1]; foreach my $id(@array_id) { $text_hash{$id} = $array[0]; } } close IN; foreach my $key (keys %text_hash) { my $value = $text_hash{$key}; my @value_split = split "\_\_", $value; my ($NCBI_annotation_seq, $Organism_seq) = split "\t", $hash_id{$key} if defined $hash_id{$key}; print OUT "$key\t$value\t$value_split[1]\t$Organism_seq\t$value_split[0]\t$NCBI_annotation_seq\n" if defined $hash_id{$key}; } close OUT; } ######################################################### ######################################################### sub split_to_subfiles{ my ($file,$split,$output_dir,$log_file); open(my $out_fh, '>-') or die "Could not open stdout for writing"; if ( @_ && $_[0] eq __PACKAGE__) { GetOptions('i|input-file=s' => \$file, 'o|output-dir=s' => \$output_dir, 'n|split-number=i' => \$split, 'l|log-file=s' => \$log_file) or die "Invalid options\n"; } else { ($file,$split,$output_dir,$log_file) = @_; } #this will find records with a quickness (my $total) = `grep -c ">" $file`; chomp $total; die "no fasta records identified, exiting.\n" unless $total; my $records_per_file = int ($total / $split); my $leftover_records = $total % $split; if ($split > $total) { warn "Total records in file ($total) less than splitnum ($split); adjusting toatal\n"; $records_per_file = 1; } my $input_file_name = basename($file); my $output_base_path = (defined $output_dir) ? "$output_dir/$input_file_name" : $input_file_name; my $in = new Bio::SeqIO (-file=>$file, -format=>"fasta"); my $x; my @outs; my $records; for my $x (1..$split) { #adjust # of records per file for leftover records my $adjusted_records_per_file; $adjusted_records_per_file = $records_per_file+1 if $x <= $leftover_records; push @outs, new Bio::SeqIO (-file=>">$output_base_path.$x", -format=>"fasta") } my $out = shift @outs; my $filecounter = 1; my $recordcounter =1; while (my $seq = $in->next_seq) { my $adjusted_records_per_file = $filecounter<=$leftover_records?$records_per_file+1:$records_per_file; $out->write_seq($seq); if (++$records>=$adjusted_records_per_file) { $out = shift @outs; $records =0; $filecounter++; } } } ######################################################### ######################################################### sub blast_filter { #blastn 2023-01-10 xiangyang li edited my ($infile, $e_value, $identify, $coverage, $match_length, $all_vs_all_cluster, $temp_s) = @_; open (INFILE, $infile); open (TEMP_S, ">$temp_s") if defined $temp_s; my $output_title = "Qseqid\tSseqid\tBitscore\tE-value\tPidenty\tQ_coverage\tS_coverage\tMacth_length\tQlength"; open (OUTFILE, ">$all_vs_all_cluster"); print OUTFILE "$output_title\n"; my @infile; while(){ chomp; $_ =~ s/ //g; my @blast_result0=split '\t', $_; my $qcover0 = ($blast_result0[7]-$blast_result0[6]+1)/$blast_result0[3] * 100; #added by xiangyang $qcover0 = sprintf("%.2f", $qcover0); my $scover0 = ($blast_result0[9]-$blast_result0[8]+1)/$blast_result0[4] * 100; #added by xiangyang $scover0 = sprintf("%.2f", $scover0); if (($blast_result0[10] <= $e_value) && ($blast_result0[2] >= $identify) && ($blast_result0[5] >= $match_length)){ if ($qcover0 >= $coverage){ if ($scover0 >= $coverage){ push @infile, $_; } else{ if ( (defined $temp_s) && ($scover0 >= 50) ){ #store subject gene id when the length of query gene is smaller than that of subject gene print TEMP_S "$blast_result0[1]\t$blast_result0[0]\t$blast_result0[2]\t$blast_result0[4]\t$blast_result0[3]\t$blast_result0[5]\t$blast_result0[8]\t$blast_result0[9]\t$blast_result0[6]\t$blast_result0[7]\t$blast_result0[10]\t$blast_result0[11]\n" ; } } } else{ push @infile, $_ if $scover0 == 100.00; } } } close INFILE; close TEMP_S if defined $temp_s; my $add_row = "Addition_row Addition_row 100 1000 1100 1000 1 1000 1 1100 0.0 0"; push @infile, $add_row; my $row=0; my %st_hash; for(my $i=0; $i 0; return $temp_s if defined $temp_s; #return $all_vs_all_cluster; #return $temp_s if defined $temp_s; } ######################################################### ######################################################### sub de_repeat_array { my @array = @_; my %hash; #定义一个空hash my @result=grep {++$hash{$_}<2} @array; #去除冗余元素 #print join ("\n", @result), "\n"; return @result; } sub check_tool { ###Test-step1: Checks for pneumo-typer dependencies my ($script, $dir) = @_; print "\n\nTest-step1: Checks for pneumo-typer dependencies...\n"; print "################################################################\n"; #Checks for perl modules &check_perl_modules; print "\n----------------------------------------------------------------\n"; #Chechs for additional programs dependencies &check_programs; print "################################################################\n"; ###Test-step2: Begin test pneumo-typer.pl with the test_data print "\n\n\n\n\nTest-step2: Begin test pneumo-typer.pl...\n"; print "################################################################\n"; my $check_g = system ("perl $script -d $dir -t 10 -m T -c T -p T"); print "################################################################\n"; if ($check_g eq 0){ print "Ok, pneumo-typer works successfully!!\n\n"; }else { print "Not Ok, pneumo-typer works with some errors!\n\n"; } } #check Perl modules dependencies; sub check_perl_modules { my @test_modules = ("GD", "GD::SVG", "SVG", "threads", "File::Basename", "FindBin", "lib", "Getopt::Long", "Math::BigFloat", "Storable", "vars", "File::Spec", "Bio::SeqIO", "Bio::Tree::NodeI", "Bio::TreeIO"); my $check_number=0; foreach (@test_modules){ my $cmd = "perl -M$_ -e 'print \"***$_ Version\\t \".$_->VERSION.\"\tok.\n\"'"; #perl -MGD::SVG -e 'print GD::SVG->VERSION. "\n"' my $test_out = system ($cmd); print "***Warning:\t$_ is not installed\n" if $test_out ne 0; $check_number++ if $test_out ne 0; } if ($check_number == 0){ print "!!!Ok, all dependencies Perl modulers are installed*\n"; }else{ print "!!!Not ok, it appears that certain dependencies Perl modulers are installed*\n"; } } # checks software dependencies sub check_programs{ print "Checking for makeblastdb ... "; my $makeblastdb_path = `which makeblastdb`; chomp $makeblastdb_path; if (not -e $makeblastdb_path){ print "error: makeblastdb is not installed\n"; } else{ print "OK, makeblastdb is installed at: $makeblastdb_path\n"; } print "Checking for blastn ... "; my $blastn_path = `which blastn`; chomp $blastn_path; if (not -e $blastn_path){ print "error: blastn is not installed\n"; } else{ print "OK, blastn is installed at: $blastn_path\n"; } print "Checking for prodigal ... "; my $prodigal_path = `which prodigal`; chomp $prodigal_path; if (not -e $prodigal_path){ print "error: prodigal is not installed\n"; } else{ print "OK, prodigal is installed at: $prodigal_path\n"; } print "Checking for blat ... "; my $blat_path = `which blat`; chomp $blat_path; if (not -e $blat_path){ print "error: blat is not installed\n"; } else{ print "OK, blat is installed at: $blat_path\n"; } } #################################################### #################################################### sub batch_genomenucleotide_extract_run { my ($path, $fasta1, $fasta2, $thread_number)=@_; opendir DIR, $path or die $!; ################################start rename the file in following case my @path_temp = readdir DIR; foreach (@path_temp){ if (/[ =]/) { my $new_name = $_; $new_name =~ s/ /_/g; #repalce the space with "_" for all GenBank files $new_name =~ s/=/_/g; #repalce "=" with "_" for all GenBank files $new_name =~ s/_+/_/g; system ("mv '$path/$_' '$path/$new_name'"); } } closedir DIR; #################################end rename the file in following case opendir DIR, $path or die $!; my @dir = readdir DIR; @dir = grep ($_!~/^\./, @dir); closedir DIR; my $job_number=0; my $sub_file_number = scalar @dir; print " Genomenucleotide_extract_percent: "; use Para::Runner; my $runner = Para::Runner->new($thread_number); foreach my $file(@dir){ my $input="$path/$file"; my $output1="$fasta1/$file"; my $output2="$fasta2/$file"; my $progress_record = int (($job_number/$sub_file_number)*100); $job_number++; my $Genomenucleotide_extract_progress = int (($job_number/$sub_file_number)*100); print "$Genomenucleotide_extract_progress%","..." if ($job_number == 1 or ( ($Genomenucleotide_extract_progress%10 ==0) && ($progress_record <$Genomenucleotide_extract_progress)) ); $runner->run( sub{ open (TEMP_IN, $input); my @temp_in = ; my $first_line = shift @temp_in; my $po_seq = join ("", @temp_in); open(OUTPUT_1, ">$output1"); open(OUTPUT_2, ">$output2"); my $filename = basename $input; if ($first_line =~ /^>/){ #input is fasta format my %hash_seq = parse_fasta($input); my $jdna; foreach (keys %hash_seq){ print OUTPUT_1 ">$_\n", print_sequence_into_file($hash_seq{$_}, 60), "\n"; $jdna .=$hash_seq{$_}; } print OUTPUT_2 ">$filename\n", print_sequence_into_file($jdna, 60); }else{ #input is genbank format my $DDD; my $in = new Bio::SeqIO( -file => $input, -format => 'genbank' ); while (my $seqObject = $in->next_seq()){ my $acc = $seqObject->accession; my $desc = $seqObject->desc; my $dna = $seqObject->seq(); my $id = $seqObject->display_id(); print OUTPUT_1 ">$id\n", print_sequence_into_file($dna, 60), "\n"; $DDD .=$dna; } $DDD=print_sequence_into_file($DDD, 60); # print_sequence_into_file($dna, 70); # 60 characters per line print OUTPUT_2 ">$filename\n$DDD"; } close OUTPUT_1; close OUTPUT_2; } )#run end } $runner->finish; print "done\n"; } ################################################################################################## ###### Subrounting--do bacth work ###### Function: ###### do bacth work ################################################################################################## sub batch_genenucleotide_TFT_extract_run { my ($path_genbank, $path_TFT, $path_protein, $path_gene, $thread_number, $prodigal_annotation, $path_fa)=@_; opendir PATH_GENBANK, $path_genbank or die "could not open $path_genbank"; my @path_genbank = readdir PATH_GENBANK; @path_genbank =grep ($_!~/^\./ ,@path_genbank); #delete hidden file . .. closedir PATH_GENBANK; my $job_number=0; my $sub_file_number = scalar @path_genbank; print " Genenucleotide_TFT_extract_percent: "; use Para::Runner; my $runner = Para::Runner->new($thread_number); foreach my $file_genbank(@path_genbank){ $file_genbank =~ s/ /_/g; my $input = "$path_genbank/$file_genbank"; my $output_1 = "$path_protein/$file_genbank.fasta"; my $output_2 = "$path_TFT/$file_genbank.tbl"; my $output_3 = "$path_gene/$file_genbank.fasta"; my $progress_record = int (($job_number/$sub_file_number)*100); $job_number++; my $Genenucleotide_TFT_extract_progress = int (($job_number/$sub_file_number)*100); print "$Genenucleotide_TFT_extract_progress%","..." if ($job_number == 1 or ( ($Genenucleotide_TFT_extract_progress%10 ==0) && ($progress_record <$Genenucleotide_TFT_extract_progress)) ); $runner->run( sub{ my $file_name = basename($input); open(OUTPUT_1, ">$output_1"); open(OUTPUT_2, ">$output_2"); open(OUTPUT_3, ">$output_3"); my $in = new Bio::SeqIO( -file => $input, -format => 'genbank' ); while ( my $seqObject = $in->next_seq() ) { my $acc = $seqObject->accession; my @features = $seqObject->get_SeqFeatures(); my $count = 0; foreach (@features) { my $feat = $_; unless ($feat->primary_tag =~ /^CDS|^rRNA|^tRNA|^source/) { next; } my $primary; ($primary) = $feat->primary_tag; if ($primary eq "source") { print OUTPUT_2 ">Contig: $acc\n"; } else { $count++; my ($start, $stop); my $location = $feat->location; my $locString = $location->to_FTstring; if ($locString =~ /,/) { if ($locString =~ /complement/) { my @loc = split( /,/, $locString); if ($loc[0] =~ /(\d+)\.\.(\d+)/){ my $length_part = $2-$1+1; $loc[1] =~ /(\d+)\.\.(\d+)/; $start = $2; $stop = $1-$length_part; } else{ $loc[1] =~ /(\d+)\.\.(\d+)/; $start = $2; $stop = $1-1; } } else { my @loc = split( /,/, $locString ); if ($loc[0] =~ /(\d+)\.\.(\d+)/) { my $length_part = $2-$1+1; $loc[1] =~ /(\d+)\.\.(\d+)/; $stop = $2; $start = $1-$length_part; } else{ $loc[1] =~ /(\d+)\.\.(\d+)/; $stop = $2; $start = $1-1; } } } else { if ($locString =~ /complement$(.+)\.\.(.+)$/) { $start = $2; $stop = $1; } else { $locString =~ /(.+)\.\.(.+)/; $start = $1; $stop = $2; } } my $gene_name; if ( $feat->has_tag('gene') ) { ($gene_name) = $feat->get_tag_values('gene'); } my $locus_tag; if ( $feat->has_tag('locus_tag') ) { ($locus_tag) = $feat->get_tag_values('locus_tag'); }else {$locus_tag = "CDS_$acc"."_".$count;} my $product; if ( $feat->has_tag('product') ) { ($product) = $feat->get_tag_values('product'); $product =~ s/ /_/g; $product =~ s/;/_/g; $product =~ s/,/_/g; }else { $product = "No_product_tag"; } my $pseudo; if ( $feat->has_tag('pseudo') ) { $pseudo = "pseudo"; if (defined $gene_name){ print OUTPUT_2 "$start\t$stop\t$primary\t$gene_name;$locus_tag\t$product\t$acc\t$pseudo\n"; } else { print OUTPUT_2 "$start\t$stop\t$primary\t$locus_tag\t$product\t$acc\t$pseudo\n"; } } else { if (defined $gene_name){ print OUTPUT_2 "$start\t$stop\t$primary\t$gene_name;$locus_tag\t$product\t$acc\n"; } else { print OUTPUT_2 "$start\t$stop\t$primary\t$locus_tag\t$product\t$acc\n"; } } my $protein_seqeunce; if ( $feat->has_tag('translation')) { ($protein_seqeunce) = $feat->get_tag_values('translation'); print OUTPUT_1 ">$locus_tag#$product;$file_name\n$protein_seqeunce\n"; } my $dna; if ( $feat->spliced_seq->seq ) { $dna = $feat->spliced_seq->seq; print OUTPUT_3 ">$locus_tag#$product;$file_name\n$dna\n"; } } } } close OUTPUT_2; close OUTPUT_3; my @feature = stat ($output_3); my $size = $feature[7]; my $sec_input = "$path_fa/$file_name"; &prodigal_tool($sec_input, $output_1, $output_3, $output_2, $prodigal_annotation) if $size == 0; } )#run end } $runner->finish; print "done\n"; } ######################################### # bacth annnotate genome using prodigal ######################################### sub prodigal_bacth_run { my ($input_dir, $protein_dir, $gene_dir, $tft_dir, $thread_number, $prodigal_annotation) = @_; my $prodigal = `which prodigal`; $prodigal =~ s/\n//g; opendir (DIR_SEQ, $input_dir) or die "could not open $input_dir"; my @input_dir = readdir DIR_SEQ; @input_dir = grep ($_!~/^\./ ,@input_dir); closedir DIR_SEQ; my $sub_file_number = scalar @input_dir; my $job_number=0; print " Annotating genome using prodigal: "; use Para::Runner; my $runner = Para::Runner->new($thread_number); foreach(sort @input_dir){ my $input = "$input_dir/$_"; my $outprotein = "$protein_dir/$_"; my $outgene = "$gene_dir/$_"; my $outtft = "$tft_dir/$_"; my $progress_record = int (($job_number/$sub_file_number)*100); $job_number++; my $genome_annotation_percent = int (($job_number/$sub_file_number)*100); print "$genome_annotation_percent%","..." if ($job_number == 1 or ( ($genome_annotation_percent%10 ==0) && ($progress_record <$genome_annotation_percent)) ); $runner->run( sub{ $outgene =~ s/.fasta$// if $prodigal_annotation eq "F"; $outtft =~ s/.tbl$//; system ("$prodigal -i $input -f gbk -g 11 -q -p meta -a $outprotein -d $outgene -o $outtft"); &gene_file($outprotein); &gene_file($outgene); &tbl_file($outtft); } )#run end } $runner->finish; print "done\n"; } ######################################################### ######################################################### sub bacth_blast_best_run { my ($workplace, $path_protein, $blastp_out_dir, $path_gene, $blastn_out_dir, $db_file, $mmseqs, $blastn, $makeblastdb, $thread_number, $homologous_gene_cutoff) = @_; my $blat = `which blat`; $blat =~ s/\n//g; my ($evalue, $identify, $coverage, $match_length) = split /,/, $homologous_gene_cutoff; #my $bestp_output = "$workplace/BESTp.BLASTOUT"; #my $all_vs_all_clusterp = "$workplace/all_vs_all.clusterp"; #my $best_output = "$workplace/BEST.BLASTOUT"; #my $all_vs_all_cluster = "$workplace/all_vs_all.cluster"; my $temp_dir = "$workplace/temp_dir"; mkdir $temp_dir; use File::Basename qw; my $db = $workplace."/".basename($db_file); system ("cp $db_file $db"); system ("$makeblastdb -in $db -dbtype nucl > $workplace/temp.txt"); #MMseqs database format my $targetDB = $workplace."/targetDB"; my $tmp = "$workplace/tmp"; if ($mmseqs =~/mmseqs/){ system("$mmseqs createdb $db $targetDB -v 0"); system("$mmseqs createindex $targetDB tmp --search-type 4 -v 0"); } opendir (PROTEIN_FOLDER, $path_protein); my @path_protein = readdir PROTEIN_FOLDER; @path_protein = grep ($_!~/^\./, @path_protein); closedir PROTEIN_FOLDER; opendir (GENE_FOLDER, $path_gene); my @path_gene = readdir GENE_FOLDER; @path_gene = grep ($_!~/^\./, @path_gene); closedir GENE_FOLDER; my $sub_file_number = scalar @path_gene; my $job_number=0; print " Blastn_percent: "; use Para::Runner; my $runner = Para::Runner->new($thread_number); foreach my $eachfile (@path_gene) { my $inputp = "$path_protein/$eachfile"; my $outfilep = "$blastp_out_dir/$eachfile.blastout"; my $parsedoutp = "$blastp_out_dir/$eachfile.parsedoutp"; my $input = "$path_gene/$eachfile"; my $outfile = "$blastn_out_dir/$eachfile.blastout"; my $parsedoutn = "$blastn_out_dir/$eachfile.parsedoutn"; my $parsedout = "$blastn_out_dir/$eachfile.parsedout"; my $temp_file = "$temp_dir/$eachfile.temp"; my $blat_out = "$temp_dir/$eachfile.blatout"; my $blat_query = "$workplace/Whole_fasta1/$eachfile"; $blat_query =~ s/.fasta$//; my $eachtft = "$workplace/Whole_TFT/$eachfile"; $eachtft =~ s/.fasta$/.tbl/; my $progress_record = int (($job_number/$sub_file_number)*100); $job_number++; my $Blastn_percent = int (($job_number/$sub_file_number)*100); print "$Blastn_percent%","..." if ($job_number == 1 or ( ($Blastn_percent%10 ==0) && ($progress_record <$Blastn_percent)) ); $runner->run( sub{ #****** Warining: this subrouine is only used to find the best hit for each query gene against database, and a loose cutoff for evalue my $cmd = "$blastn -outfmt '6 qseqid sseqid pident qlen slen length qstart qend sstart send evalue bitscore' -max_target_seqs 2 -evalue $evalue -num_threads 1 -query $input -db $db -out $outfile"; system("$cmd 2>/dev/null"); #not print warning on terminal &blast_filter ($outfile, $evalue, $identify, $coverage, $match_length, $parsedoutn, $temp_file); &extract_sequence($inputp, $parsedoutn); #$parsedoutn.sub.fasta, extracted protein sequence from blastn results my $cmdp; $cmdp = "$mmseqs -outfmt '6 qseqid sseqid pident qlen slen length qstart qend sstart send evalue bitscore qframe sframe' -max_target_seqs 2 -evalue $evalue -num_threads 1 -query $parsedoutn.sub.fasta -db $db -out $outfilep -db_gencode 11 -seg 'no'" if $mmseqs =~ /tblastn/; #using tblast $cmdp = "$mmseqs easy-search $parsedoutn.sub.fasta $targetDB $outfilep $tmp --min-aln-len 30 --threads 4 --max-seqs 2 --format-output query,target,pident,qlen,tlen,alnlen,qstart,qend,tstart,tend,evalue,bits --translation-table 11 -v 0" if $mmseqs =~ /mmseqs/; # --cov-mode 0 -c 0.95 --min-seq-id 0.7 system("$cmdp 2>/dev/null"); #not print warning on terminal &blast_filter ($outfilep, $evalue, $identify, $coverage, $match_length, $parsedoutp); &subtract_id ($parsedoutn, $parsedoutp, $parsedout); ###### process query genes that satisfy identity>=70% but not coverage>=95% &extract_sequence($db, $temp_file) if (stat ($temp_file))[7] > 0; #$temp_file.sub.fasta, extracted protein sequence from blastn results in case that system ("$blat $blat_query $temp_file.sub.fasta $blat_out -out=pslx -fastMap -noHead > $temp_dir/temp.txt") if (stat ($temp_file))[7] > 0; &blat_copy($db, $blat_query, $temp_dir, $temp_file, $makeblastdb, $mmseqs, $blat_out, $evalue, $identify, $coverage, $match_length, $eachfile, $eachtft, $parsedout) if (stat ($temp_file))[7] > 0; ###### end for process } )#run end } $runner->finish; print "done\n"; } sub blastn_small { my ($qy, $sj, $qsout, $makeblastdb, $mmseqs, $evalue) = @_; #system ("$makeblastdb -in $sj -dbtype nucl > $temp_dir/temp.txt"); my $cmdp = "$mmseqs -outfmt '6 qseqid sseqid pident qlen slen length qstart qend sstart send evalue bitscore qframe sframe' -max_target_seqs 2 -evalue $evalue -num_threads 1 -query $qy -db $sj -out $qsout -db_gencode 11 -seg 'no'"; #using tblastx #print "$cmdp\n"; system("$cmdp 2>/dev/null"); #not print warning on terminal } sub blat_copy { my ($db, $blat_query, $temp_dir, $temp_file, $makeblastdb, $tblastn, $outputfinal, $evalue, $identify, $coverage, $match_length, $eachfile, $eachtft, $parsedout) = @_; my $passif = "$temp_dir/$eachfile.passif.fasta"; my $passif_blastout = "$temp_dir/$eachfile.passif_blastout"; my $passif_blastoutn = "$temp_dir/$eachfile.passif_blastoutn"; my %record_pos; my %record_tft; my %record_gid; my %record_check; open (OT, $outputfinal); while (){ chomp; next unless $_ =~ /^\d/; $_ =~ s/ //g; my @array = split /\t/, $_; my $sum = $array[1]+$array[2]+$array[3]+$array[4]+$array[5]+$array[6]+$array[7]; if ($array[0]/$array[10]*100>=$coverage){ my $new_id = $array[9]; my $start_pos = $array[15] + 1; my $end_pos = $array[16]; my $frame = $array[8]; push @{$record_pos{$array[13]}}, "$new_id $start_pos $end_pos $frame"; my $glen = abs($end_pos - $start_pos) + 1; $new_id =~ s/\|.*//g; my ($gene_id, $annotation, $genome_name) = split /[;#]/, $new_id; #print "$gene_id\t$annotation\t$genome_name\n"; if ($frame eq "+"){ $record_tft{$array[16]."|".$array[13]} = "$start_pos\t$array[16]\tCDS\t$gene_id\t$annotation\t$array[13]"; $record_gid{$array[16]."|".$array[13]} = $gene_id; $record_check{$array[9]."|".$glen} = $array[16]."|".$array[13]; }else{ $record_tft{$start_pos."|".$array[13]} = "$array[16]\t$start_pos\tCDS\t$gene_id\t$annotation\t$array[13]"; $record_gid{$start_pos."|".$array[13]} = $gene_id; $record_check{$array[9]."|".$glen} = $start_pos."|".$array[13]; } } } close OT; &extract_seq_to_position($blat_query, "$passif.nucl", \%record_pos); &cds2pro("$passif.nucl", $passif, "$passif.bad_record"); &blastn_small($passif, $db, $passif_blastout, $makeblastdb, $tblastn, $evalue) if (stat ($passif))[7]>0; &blast_filter($passif_blastout, $evalue, $identify, $coverage, $match_length, $passif_blastoutn) if -e $passif_blastout; &modity_seqID_tft($passif_blastoutn, $eachtft, \%record_tft, \%record_gid, \%record_check, $parsedout) if -e $passif_blastoutn; } sub modity_seqID_tft { my ($hit, $eachtft, $ref_hash1, $ref_hash2, $ref_hash3, $parsedout) = @_; my %record_tft = %$ref_hash1; my %record_gid = %$ref_hash2; my %record_check = %$ref_hash3; my %tbl; open (ETFT, $eachtft); while(){ chomp; next unless $_ !~ /^>Contig/; my @tbl = split /\t/, $_; $tbl{$tbl[1]."|".$tbl[5]} = $tbl[3]; #keys: gene_stop_position."1".contig (e.g 8253|CAALZB010000020); values: geneid (e.g GCA_901222125.1_1987) } close ETFT; open (HIT, $hit); my %add_gene_row; #store the passed gene information for each genome while(){ chomp; next unless $_ !~ /^Qseqid Sseqid/; my @q = split /\t/, $_; my $lq = $q[8]*3; my $key_check = $record_check{$q[0]."|".$lq}; #print "$q[0]","|","$lq\n"; if ($record_gid{$key_check} eq $tbl{$key_check}){ system("sed -i 's/.*\t\Q$record_gid{$key_check}\E\t.*/\Q$record_tft{$key_check}\E/' $eachtft"); #revise tbl file in Whole_TFT folder to correct the gene call drawback by prodigal when geneid is identity in $passif_blastoutn and $eachtft my $qnew = $q[0]; $qnew =~ s/\|.*//g; my $snew = $q[1]; $snew =~ s/.*\|//g; my $lmatch = $q[7]*3; $add_gene_row{$qnew} = "$qnew\t$snew\t$q[2]\t$q[3]\t$q[4]\t$q[5]\t$q[6]\t$lmatch\t$lq\tremedy_by_blat_tblastn\n"; } } close HIT; #add the passed gene information into cps gene for each genome open (ADD, ">>$parsedout"); foreach (keys %add_gene_row){ print ADD $add_gene_row{$_}; } close ADD; system("sed -i 's/\|.*\|/\t/g' $hit"); } sub extract_seq_to_position { my ($in_file, $out_file, $ref) = @_; my %a = %$ref; my @c = keys %a; use Bio::SeqIO; my $in = Bio::SeqIO->new ( -file => $in_file, -format => 'fasta'); open (OUT, ">$out_file"); while (my $seq = $in->next_seq() ) { if (grep ($_ eq $seq->id, @c)){ foreach (@{$a{$seq->id}}){ my ($id, $start_pos, $end_pos, $frame) = split / /, $_; print OUT ">$id\n", $seq->trunc($start_pos, $end_pos)->seq, "\n" if $frame eq "+"; print OUT ">$id\n", &reverse_complement($seq->trunc($start_pos, $end_pos)->seq), "\n" if $frame eq "-"; } } } close OUT; } sub reverse_complement { my $infile = shift; use Bio::Seq; my $seqobj = Bio::Seq->new(-seq => $infile); my $Query = $seqobj->revcom()->seq; return $Query; } sub cds2pro { my ($cds, $pro, $bad_record) = @_; my %record_stop_codon; my $index=0; my %DNAfilename = parse_fasta($cds); open (OUT, ">$pro"); open (DR, ">$bad_record"); foreach my $key (keys %DNAfilename){ $index++; my $record_key = $key."||".$index; my $DNA = $DNAfilename{$key}; chomp $DNA; $DNA =~ s/\s//g; my $protein=''; my $codon; for(my $i=0;$i<(length($DNA)-2);$i+=3){ $codon=substr($DNA,$i,3); $protein.=&codon2aa($codon); push @{$record_stop_codon{$record_key}}, $i if codon2aa($codon) eq "*"; #print "$record_key\t$i\n" if codon2aa($codon) eq "*"; } my $stop_codon_num = scalar @{$record_stop_codon{$record_key}}; my $stop_codon_pos = join ("|", @{$record_stop_codon{$record_key}}); my $len_check = abs(length($DNA)/3-int(length($DNA)/3)); my $glen = length($DNA); if ( ($stop_codon_num eq 1) && ($len_check eq 0) ){ print OUT ">$key\n$protein\n"; }else{ print DR ">$key length: $glen stop_codon_number: $stop_codon_num stop_codon_position: $stop_codon_pos\n$protein\n"; } } close OUT; close DR; #return %record_stop_codon; } sub codon2aa{ my($codon)=@_; $codon=uc $codon; my(%g)=( 'AAA' => 'K', # Lysine 'AAC' => 'N', # Asparagine 'AAG' => 'K', # Lysine 'AAT' => 'N', # Asparagine 'ACA' => 'T', # Threonine 'ACC' => 'T', # Threonine 'ACG' => 'T', # Threonine 'ACT' => 'T', # Threonine 'AGA' => 'R', # Arginine 'AGC' => 'S', # Serine 'AGG' => 'R', # Arginine 'AGT' => 'S', # Serine 'ATA' => 'I', # Isoleucine 'ATC' => 'I', # Isoleucine 'ATG' => 'M', # Methionine 'ATT' => 'I', # Isoleucine 'CAA' => 'Q', # Glutamine 'CAC' => 'H', # Histidine 'CAG' => 'Q', # Glutamine 'CAT' => 'H', # Histidine 'CCA' => 'P', # Proline 'CCC' => 'P', # Proline 'CCG' => 'P', # Proline 'CCT' => 'P', # Proline 'CGA' => 'R', # Arginine 'CGC' => 'R', # Arginine 'CGG' => 'R', # Arginine 'CGT' => 'R', # Arginine 'CTA' => 'L', # Leucine 'CTC' => 'L', # Leucine 'CTG' => 'L', # Leucine 'CTT' => 'L', # Leucine 'GAA' => 'E', # Glutamic Acid 'GAC' => 'D', # Aspartic Acid 'GAG' => 'E', # Glutamic Acid 'GAT' => 'D', # Aspartic Acid 'GCA' => 'A', # Alanine 'GCC' => 'A', # Alanine 'GCG' => 'A', # Alanine 'GCT' => 'A', # Alanine 'GGA' => 'G', # Glycine 'GGC' => 'G', # Glycine 'GGG' => 'G', # Glycine 'GGT' => 'G', # Glycine 'GTA' => 'V', # Valine 'GTC' => 'V', # Valine 'GTG' => 'V', # Valine 'GTT' => 'V', # Valine 'TAA' => '*', # Stop 'TAC' => 'Y', # Tyrosine 'TAG' => '*', # Stop 'TAT' => 'Y', # Tyrosine 'TCA' => 'S', # Serine 'TCC' => 'S', # Serine 'TCG' => 'S', # Serine 'TCT' => 'S', # Serine 'TGA' => '*', # Stop 'TGC' => 'C', # Cysteine 'TGG' => 'W', # Tryptophan 'TGT' => 'C', # Cysteine 'TTA' => 'L', # Leucine 'TTC' => 'F', # Phenylalanine 'TTG' => 'L', # Leucine 'TTT' => 'F', # Phenylalanine 'aaa' => 'K', 'caa' => 'Q', 'gaa' => 'E', 'taa' => '*', 'aac' => 'N', 'cac' => 'H', 'gac' => 'D', 'tac' => 'Y', 'aag' => 'K', 'cag' => 'Q', 'gag' => 'E', 'tag' => '*', 'aat' => 'N', 'cat' => 'H', 'gat' => 'D', 'tat' => 'Y', 'aca' => 'T', 'cca' => 'P', 'gca' => 'A', 'tca' => 'S', 'acc' => 'T', 'ccc' => 'P', 'gcc' => 'A', 'tcc' => 'S', 'acg' => 'T', 'ccg' => 'P', 'gcg' => 'A', 'tcg' => 'S', 'act' => 'T', 'cct' => 'P', 'gct' => 'A', 'tct' => 'S', 'aga' => 'R', 'cga' => 'R', 'gga' => 'G', 'tga' => '*', 'agc' => 'S', 'cgc' => 'R', 'ggc' => 'G', 'tgc' => 'C', 'agg' => 'R', 'cgg' => 'R', 'ggg' => 'G', 'tgg' => 'W', 'agt' => 'S', 'cgt' => 'R', 'ggt' => 'G', 'tgt' => 'C', 'ata' => 'I', 'cta' => 'L', 'gta' => 'V', 'tta' => 'L', 'atc' => 'I', 'ctc' => 'L', 'gtc' => 'V', 'ttc' => 'F', 'atg' => 'M', 'ctg' => 'L', 'gtg' => 'V', 'ttg' => 'L', 'att' => 'I', 'ctt' => 'L', 'gtt' => 'V', 'ttt' => 'F' ); if(exists $g{uc $codon}) { return $g{uc $codon}; } else{ print STDERR "Bad codon \"$codon\"!!\n"; #exit; } } sub subtract_id { my ($nblist, $pblist, $blist) = @_; open (P, $pblist); my %p; while(

){ chomp; $_ =~ s/\t.*//g; $p{$_} = $_; } close P; open (N, $nblist); open (B, ">$blist"); while(){ chomp; my @cc = split /\t/, $_; print B "$_\n" if defined $p{$cc[0]}; print B "$_\n" if !defined $p{$cc[0]} && $cc[6] == 100.00; } close N; close B; } sub extract_sequence { my ($protein_file, $blastout_list) = @_; my %seq_hash = parse_fasta($protein_file); open (LIST, $blastout_list) or die "can not open $blastout_list"; my @array = ; open(OUT, ">$blastout_list.sub.fasta"); my @key = keys %seq_hash; foreach my $kk(@array){ chomp $kk; next unless $kk !~ /^Qseqid Sseqid/; $kk =~ /(.*?)\t(.*?)\t.*/; my $queryid = $2; if (grep(/$1/,@key)){ my @id = grep(/$1/,@key); print OUT ">$queryid|$id[0]\n$seq_hash{$id[0]}\n" if $blastout_list =~ /fasta.temp$/; # related Query gene id and subject gene id print OUT ">$id[0]\n$seq_hash{$id[0]}\n" if $blastout_list !~ /fasta.temp$/; } } #foreach my $kk(@array){ # chomp $kk; # next unless $kk !~ /^Qseqid Sseqid/; # $kk =~ s/\t(.*)//g; # print OUT ">$kk\n$seq_hash{$kk}\n"; #} close LIST; close OUT; } sub recheck_pseudogene_len{ my ($all_vs_all_cluster, $filter_pseudogene, $homologous_gene_cutoff) = @_; my ($evalue, $identify, $coverage, $match_length) = split /,/, $homologous_gene_cutoff; open (FPD, $filter_pseudogene); my %fpd; while(){ chomp; my @fpd = split /\t/, $_; $fpd{$fpd[0]} = $fpd[1]; } close FPD; open (AVA, $all_vs_all_cluster); my @store; while(){ chomp; #next unless $_ !~ /\t/; my @ava = split /\t/, $_; if ( (defined $fpd{$ava[1]}) && ($ava[8] < $fpd{$ava[1]}*$coverage/100) ){ #delete query that hit cps pseudogene whose length is <= normal_length * $coverage print "oooo: $_\t$fpd{$ava[1]}\n"; }else{ push @store, "$_\n"; } } close AVA; open (AVA, ">$all_vs_all_cluster"); print AVA join ("", @store); close AVA; } sub obtain_map_cmd { my ($map_cmd, $tref) = @_; my %options = %$tref; open (MAP_CMD, $map_cmd); my @cmd; while(){ chomp; next unless $_ =~ /^perl/; push @cmd, $_; } close MAP_CMD; my ($remap_cmd1, $remap_cmd2, $remap_cmd3); if (defined $options{phylogenetic_tree}){ $remap_cmd1 = $cmd[0]." -tree ".$options{phylogenetic_tree}; $remap_cmd2 = $cmd[1]." -tree ".$options{phylogenetic_tree}; $remap_cmd3 = $cmd[2]." -tree ".$options{phylogenetic_tree}; }elsif (defined $options{strain_reorder_file}){ $remap_cmd1 = $cmd[0]." -srf ".$options{strain_reorder_file}; $remap_cmd2 = $cmd[1]." -srf ".$options{strain_reorder_file}; $remap_cmd3 = $cmd[2]." -srf ".$options{strain_reorder_file}; }else{ $remap_cmd1 = $cmd[0]; $remap_cmd2 = $cmd[1]; $remap_cmd3 = $cmd[2]; } return $remap_cmd1, $remap_cmd2, $remap_cmd3; } 1; __END__