#! usr/bin/perl -w use strict; use warnings; use Getopt::Long; use FindBin; use File::Spec; use threads; use Bio::SeqIO; use File::Basename qw; #Before use, user should to install Aspera and set the absolute path for the following parameters #============================================================================================================================= #Aspera could be download from https://downloads.asperasoft.com/. my $ascp = '/home/xiangyang/.aspera/connect/bin/ascp'; # Aspera program my $asperaweb_id_dsa_openssh = '/home/xiangyang/.aspera/connect/etc/asperaweb_id_dsa.openssh'; # asperaweb_id_dsa.openssh abosolute pathway my $aspera_tokenauth_id_rsa = '/home/xiangyang/.aspera/connect/etc/aspera_tokenauth_id_rsa'; # aspera_tokenauth_id_rsa abosolute pathway my $home_directory = $FindBin::Bin; # obtaining the home directory where genome_batch_retrive.pl my $now_time = localtime; #Wed Aug 11 20:40:46 2021 $now_time =~ s/[ :]/_/g; my $usage = < undef, 'file_type' => "GBFF", 'genome_out_dir' => undef, 'threads' => "4", 'species' => undef, 'previous_list' => undef, 'wget' => "T", 'parallely_wget' => '1,1', #'10,5' split number and run number 'filter_list' => undef, 'help' => undef ); GetOptions( 'l|FTP_download_site_list=s' => \$options{FTP_download_site_list}, 'f|file_type=s' => \$options{file_type}, 'd|genome_out_dir=s' => \$options{genome_out_dir}, 't|threads=i' => \$options{threads}, 's|species=s' => \$options{species}, 'p|previous_list=s' => \$options{previous_list}, 'w|wget=s' => \$options{wget}, 'pw|parallely_wget=s' => \$options{parallely_wget}, #'10,5' split number and run number 'fl|filter_list=s' => \$options{filter_list}, 'h|help' => \$options{help} ); if (defined $options{help}) { print "$usage\n"; exit(0); } my $workplace; if ( defined( $options{genome_out_dir} ) ) { $workplace = File::Spec->rel2abs($options{genome_out_dir}); mkdir $workplace; }else { $workplace = "$home_directory/Result-$now_time"; $workplace =~ s/\/\//\//g; mkdir $workplace; } my $home_dir = $FindBin::Bin; my $FTP_download_site_list = $options{FTP_download_site_list}; my $species = $options{species}; my $final_list = "$workplace/final_list.txt"; my $inf = "$workplace/inf.txt"; my @full_information; open(FINAL_LIST, ">$final_list"); open(INF, ">$inf"); my %filter_ass; if (defined $options{filter_list}){ open (FILTER, $options{filter_list}); while(){ chomp; $filter_ass{$_} = $_; } close FILTER; } open(LIST, $FTP_download_site_list); #my %hash2; my @ar = ; my $h = shift @ar; $h = shift @ar; print FINAL_LIST $h; foreach(@ar){ chomp; my @genome_array = split "\t", $_; my $FTP_site = $genome_array[19]; #$FTP_site = $genome_array[15] if $genome_array[14] eq ""; if ($genome_array[7] =~ /\Q$species\E/){ #next unless $genome_array[11] =~ /^Complete Genome$/; #obtain only complete genomes $FTP_site =~ s/https:\/\/ftp.ncbi.nlm.nih.gov\///g; my $assembe_No = $FTP_site; $assembe_No =~ s/.*\/(GC[AF]_.*)/$1/g; next unless $assembe_No ne "na"; if (defined $options{filter_list}){ my $fassembe_No = $assembe_No; $fassembe_No =~ s/(GC[AF]_.*?)_.*/$1/g; next unless defined $filter_ass{$fassembe_No}; } push @full_information, "$_\n"; print FINAL_LIST "$FTP_site\n" if $options{file_type} eq "ALL"; print FINAL_LIST "$FTP_site/$assembe_No", "_genomic.gbff.gz\n" if $options{file_type} eq "GBFF"; print FINAL_LIST "$FTP_site/$assembe_No", "_genomic.fna.gz\n" if $options{file_type} eq "GS"; print FINAL_LIST "$FTP_site/$assembe_No", "_cds_from_genomic.fna.gz\n" if $options{file_type} eq "CDS_N"; print FINAL_LIST "$FTP_site/$assembe_No", "_protein.faa.gz\n" if $options{file_type} eq "CDS_A"; } } print INF join("", @full_information); #print FINAL_LIST join("\n", keys %hash2); close LIST; close FINAL_LIST; close INF; ########################################################################################### ###### update genomes when more genomes were found in NCBI genome database as time goes if ($options{wget} eq "T"){ my ($update_list, $update_sh, $update_gbk); if (defined $options{previous_list}){ ($update_list, $update_sh, $update_gbk) = update($options{previous_list}, $final_list); }else{ system("touch $workplace/old_list.txt"); ($update_list, $update_sh, $update_gbk) = update("$workplace/old_list.txt", $final_list); } $final_list = $update_list; #instead of $final_list using $update_list; if (defined $options{parallely_wget}){#split up_date.sh and download genomes using wget Parallely chdir $update_gbk; my ($spilt_number, $run_number) = split /,/, $options{parallely_wget}; &split_update_sh($home_dir, $update_sh, $spilt_number, $run_number); } } ###### ########################################################################################### my $gbk_folder = "$workplace/GBK_folder"; mkdir $gbk_folder; if ($options{wget} ne "T"){ my $cmd = "$ascp -i $asperaweb_id_dsa_openssh -i $aspera_tokenauth_id_rsa -P 22 -O 33001 -k 1 -T -l 200m --mode=recv --user=anonftp --host=ftp.ncbi.nlm.nih.gov --file-list=$final_list $gbk_folder"; system ($cmd); } chdir $gbk_folder; #system ("gzip -d *_genomic.gbff.gz"); #system ("rename 's/_genomic.gbff//' *"); my $path_fasta = "$workplace/G_SEQ"; mkdir $path_fasta; #batch_genbank_sequence_TFT_extract ($gbk_folder, $path_fasta, $options{threads}); #################################################### #################################################### sub batch_genbank_sequence_TFT_extract { my ($path, $fasta, $thread_number)=@_; opendir DIR, $path or die $!; my @dir = readdir DIR; @dir = grep ($_!~/^\./ ,@dir); closedir DIR; my @input; my @outfile; foreach my $file(@dir){ my $input="$path/$file"; push (@input,$input); my $output="$fasta/$file.fasta"; push (@outfile,$output); } my $sub_file_number = scalar @input; my $thread; my @threads; my $job_number=0; while(){ last if ($job_number eq $sub_file_number); while(scalar(threads->list())<$thread_number) { #set thread number $job_number++; my $input_file = $input[$job_number-1]; my $output_file = $outfile[$job_number-1]; print "Genbank_extraction: $job_number\n"; $threads[$job_number-1]=threads->new(\&genbank_fasta_sequence_extract, $input_file, $output_file); last if ($job_number eq $sub_file_number); } foreach $thread(threads->list(threads::all)){ if($thread->is_joinable()) { $thread->join(); } } } foreach my $th (threads->list(threads::all)) { $th->join(); } } #################################################### #################################################### sub genbank_fasta_sequence_extract { my ($input, $output)=@_; system ("gzip -d $input"); #decompress download gbk file $input =~ s/.gz$//g; #delete ".gz" from the file name my $new_name = basename $input; $new_name =~ s/_genomic.gbff//g; my $new_input = dirname($input)."/$new_name"; system ("mv $input $new_input"); my $new_output = dirname($output)."/$new_name.fasta"; open(OUTPUT, ">$new_output"); my $in = new Bio::SeqIO( -file => $new_input, -format => 'genbank' ); while (my $seqObject = $in->next_seq()){ my $acc = $seqObject->accession; my $desc = $seqObject->desc; my $dna = $seqObject->seq(); my $id = $seqObject->display_id(); my $DDD=print_sequence_into_file($dna, 60); # print_sequence_into_file($dna, 70); # 60 characters per line print OUTPUT ">$id\n$DDD\n"; } close OUTPUT; } ################################################# ################################################# # Print Comment Line (Header Line) # Print sequence line by calling print_sequence subroutine # subroutine print_sequence_into_file # print $sequence into $FILE with $length character per line sub print_sequence_into_file{ my($sequence, $length) = @_; my $DNA; my $string; # Print sequence in lines of $length for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ){ $DNA=substr($sequence, $pos, $length)."\n"; $string.=$DNA; } return $string; } sub update { my ($old, $new) = @_; my $update = dirname($new)."/update_list.txt"; my $update_sh = dirname($new)."/update_list.sh"; my $update_gbk = dirname($new)."/update_gbk"; mkdir $update_gbk; open (OLD, $old); my %old; while (){ chomp; $old{$_} = $_; } close OLD; open (NEW, $new); my %new; open (UPDATE, ">$update"); open (UPDATE_SH, ">$update_sh"); while (){ next unless $_ !~ /^#/; chomp; $new{$_} = $_; my $fn = $_; $fn =~ s/.*\///g; print UPDATE "$_\n" if !defined $old{$_}; print UPDATE_SH "wget -O $update_gbk/$fn -c https://ftp.ncbi.nlm.nih.gov/$_;\n" if !defined $old{$_}; } close NEW; close UPDATE; my $remove = dirname($new)."/remove_list.txt"; open (REMOVE, ">$remove"); foreach (keys %old){ print REMOVE "$_\n" if !defined $new{$_}; } close REMOVE; return ($update, $update_sh, $update_gbk); } sub split_update_sh{ my ($home_dir, $update_sh, $spilt_number, $run_number) = @_; my $split_update_sh = "$workplace/split_update_sh"; mkdir $split_update_sh; system ("rm -rf $split_update_sh/*"); system ("bash $home_dir/splitDatabase.sh $update_sh $spilt_number $split_update_sh"); opendir(SL, $split_update_sh); my @sp = readdir SL; @sp = sort grep ($_ !~ /^\./, @sp); closedir SL; use Parallel::Runner; my $runner = Parallel::Runner->new($run_number); for (my $i=0; $i<$spilt_number; $i++) { my $sub_sh = "$split_update_sh/$sp[$i]"; $runner->run( sub{ system ("sh $sub_sh"); } )#run end } $runner->finish; }