#! usr/bin/perl -w

use strict;
use warnings;
use Getopt::Long;
use FindBin;
use File::Spec;
use threads;
use Bio::SeqIO;
use File::Basename qw<basename dirname>;


#Before use, user should to install Aspera and set the absolute path for the following parameters
#=============================================================================================================================
#Aspera could be download from https://downloads.asperasoft.com/.
my $ascp = '/home/xiangyang/.aspera/connect/bin/ascp';                                         # Aspera program
my $asperaweb_id_dsa_openssh = '/home/xiangyang/.aspera/connect/etc/asperaweb_id_dsa.openssh'; # asperaweb_id_dsa.openssh abosolute pathway
my $aspera_tokenauth_id_rsa = '/home/xiangyang/.aspera/connect/etc/aspera_tokenauth_id_rsa';   # aspera_tokenauth_id_rsa abosolute pathway

my $home_directory = $FindBin::Bin;           # obtaining the home directory where genome_batch_retrive.pl

my $now_time = localtime;
#Wed Aug 11 20:40:46 2021
$now_time =~ s/[ :]/_/g;

my $usage = <<USAGE; 

=NAME

genome_batch_retrive.pl

=DESCRIPTION

This perlsript is used to retrive a batch of genome files from NCBI genome database using Aspera, a High-speed file transfer software

=USAGE

genomebacth_retrive_taxonomy.pl -l FTP_download_site_list -d genome_out_dir -s species

FOR EXAMPLE: 
perl 1.genomebacth_retrive_taxonomy.pl -l assembly_summary_genbank-20230731.txt -f GBFF -t 3 -s "Pseudomonas aeruginosa" -p final_list_pa.txt -d update_pa -pw '10,5'

perl 1.genomebacth_retrive_taxonomy.pl -l assembly_summary_genbank-20230731.txt -f GBFF -t 3 -s "Pseudomonas aeruginosa" -p final_list_pa.txt -d update_pa -pw '10,5' -p previous_list.txt

=ARGUMENTS
=======================
    -l, --FTP_download_site_list
        A text file, in which each line contains the FTP download website for one genome. User can obtain a batch of genomic download websites from http://www.ncbi.nlm.nih.gov/genome/browse/. 

           Example for explaination file format for individual genome data
           ---------------------------------------------------------------------------------------------------------------
             file_format                                                                               explaination
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             GCF_000429385.1_ASM42938v1/GCF_000429385.1_ASM42938v1_cds_from_genomic.fna.gz             基因核苷酸序列
             GCF_000429385.1_ASM42938v1/GCF_000429385.1_ASM42938v1_feature_table.txt.gz
             GCF_000429385.1_ASM42938v1/GCF_000429385.1_ASM42938v1_genomic.fna.gz                      基因组fasta序列
             GCF_000429385.1_ASM42938v1/GCF_000429385.1_ASM42938v1_genomic.gbff.gz                     完整序列加注释信息
             GCF_000429385.1_ASM42938v1/GCF_000429385.1_ASM42938v1_genomic.gff.gz
             GCF_000429385.1_ASM42938v1/GCF_000429385.1_ASM42938v1_protein.faa.gz                      基因蛋白质序列
             GCF_000429385.1_ASM42938v1/GCF_000429385.1_ASM42938v1_protein.gpff.gz              
             GCF_000429385.1_ASM42938v1/GCF_000429385.1_ASM42938v1_rna_from_genomic.fna.gz
           ----------------------------------------------------------------------------------------------------------------
   
    -d, --genome_out_dir
        a directory to hold download genomes data. 
    -f, --file_type
        All format files (ALL), Full GeneBank (GBFF), genomic sequence (GS), CDS nucleotide sequences (CDS_N), and CDS  amino acid sequences(CDS_A).  
            *GBFF: _genomic.gbff.gz, GenBank flat file format of the genomic sequence(s) in the  assembly.  
            *GS: _genomic.fna.gz, FASTA format of the genomic sequence(s) in the assembly. 
            *CDS_N: _cds_from_genomic.fna.gz, FASTA sequences of individual features annotated on the genomic records.
            *CDS_A: _protein.faa.gz, FASTA format sequences of the accessioned protein products annotated on the genome assembly.                    
    -h, --help
        Show this message.

=AUTHOR

Dr. Xiangyang Li (E-mail: lixiangyang\@fudan.edu.cn), Fudan university; Kaili University; Bacterial Genome Data mining & Bioinformatic Analysis (www.microbialgenomic.com/).

=COPYRIGHT

Copyright 2019, Xiangyang Li. All Rights Reserved.

USAGE


my %options = (
    'FTP_download_site_list'              => undef,   
    'file_type'                           => "GBFF",     
    'genome_out_dir'                      => undef,   
    'threads'                             => "4",   
    'species'                             => undef,
    'previous_list'                       => undef,
    'wget'                                => "T", 
    'parallely_wget'                      => '1,1', #'10,5' split number and run number
    'filter_list'                         => undef,
    'help'                                => undef
);
 

GetOptions(
    'l|FTP_download_site_list=s'              => \$options{FTP_download_site_list},  
    'f|file_type=s'                           => \$options{file_type},  
    'd|genome_out_dir=s'                      => \$options{genome_out_dir}, 
    't|threads=i'                             => \$options{threads}, 
    's|species=s'                             => \$options{species}, 
    'p|previous_list=s'                       => \$options{previous_list},
    'w|wget=s'                                => \$options{wget}, 
    'pw|parallely_wget=s'                     => \$options{parallely_wget}, #'10,5' split number and run number
    'fl|filter_list=s'                        => \$options{filter_list}, 
    'h|help'                                  => \$options{help}
);

if (defined $options{help}) {
    print "$usage\n";
    exit(0);
}

my $workplace;
if ( defined( $options{genome_out_dir} ) ) {
    $workplace = File::Spec->rel2abs($options{genome_out_dir});
    mkdir $workplace;
}else {

    $workplace = "$home_directory/Result-$now_time";
    $workplace =~ s/\/\//\//g;
    mkdir $workplace;
}

my $home_dir = $FindBin::Bin;
my $FTP_download_site_list = $options{FTP_download_site_list}; 
my $species = $options{species}; 
my $final_list = "$workplace/final_list.txt";
my $inf = "$workplace/inf.txt";
my @full_information;
open(FINAL_LIST, ">$final_list");
open(INF, ">$inf");

my %filter_ass;
if (defined $options{filter_list}){
    open (FILTER, $options{filter_list});
    while(<FILTER>){
        chomp;
        $filter_ass{$_} = $_;
    }
    close FILTER;
}

open(LIST, $FTP_download_site_list);
#my %hash2;
my @ar = <LIST>;
my $h = shift @ar;
$h = shift @ar;
print FINAL_LIST $h;

foreach(@ar){
    chomp;
    my @genome_array = split "\t", $_;
    my $FTP_site = $genome_array[19];
    #$FTP_site = $genome_array[15] if $genome_array[14] eq "";
    if ($genome_array[7] =~ /\Q$species\E/){

	#next unless $genome_array[11] =~ /^Complete Genome$/; #obtain only complete genomes
        $FTP_site =~ s/https:\/\/ftp.ncbi.nlm.nih.gov\///g;
        my $assembe_No = $FTP_site;
        $assembe_No =~ s/.*\/(GC[AF]_.*)/$1/g;
        next unless $assembe_No ne "na";
        
        if (defined $options{filter_list}){
            my $fassembe_No = $assembe_No;        
            $fassembe_No =~ s/(GC[AF]_.*?)_.*/$1/g;
            next unless defined $filter_ass{$fassembe_No};
        }
        
	push @full_information, "$_\n";

        print FINAL_LIST "$FTP_site\n" if $options{file_type} eq "ALL";
        print FINAL_LIST "$FTP_site/$assembe_No", "_genomic.gbff.gz\n" if $options{file_type} eq "GBFF";
        print FINAL_LIST "$FTP_site/$assembe_No", "_genomic.fna.gz\n" if $options{file_type} eq "GS";
        print FINAL_LIST "$FTP_site/$assembe_No", "_cds_from_genomic.fna.gz\n" if $options{file_type} eq "CDS_N";
        print FINAL_LIST "$FTP_site/$assembe_No", "_protein.faa.gz\n" if $options{file_type} eq "CDS_A";

    }

}

print INF join("", @full_information);
#print FINAL_LIST join("\n", keys %hash2);

close LIST;
close FINAL_LIST;
close INF;
###########################################################################################
###### update genomes when more genomes were found in NCBI genome database as time goes
if ($options{wget} eq "T"){
    my ($update_list, $update_sh, $update_gbk);
    if (defined $options{previous_list}){
        ($update_list, $update_sh, $update_gbk) = update($options{previous_list}, $final_list);
    }else{
        system("touch $workplace/old_list.txt");
        ($update_list, $update_sh, $update_gbk) = update("$workplace/old_list.txt", $final_list);
    }
    $final_list = $update_list; #instead of $final_list using $update_list;
    if (defined $options{parallely_wget}){#split up_date.sh and download genomes using wget Parallely
	chdir $update_gbk;
	my ($spilt_number, $run_number) = split /,/, $options{parallely_wget};
        &split_update_sh($home_dir, $update_sh, $spilt_number, $run_number);	
    }
}
######
###########################################################################################

my $gbk_folder = "$workplace/GBK_folder";
mkdir $gbk_folder;

if ($options{wget} ne "T"){
    my $cmd = "$ascp -i $asperaweb_id_dsa_openssh -i $aspera_tokenauth_id_rsa -P 22 -O 33001 -k 1 -T -l 200m --mode=recv --user=anonftp --host=ftp.ncbi.nlm.nih.gov --file-list=$final_list $gbk_folder";
    system ($cmd);
}
chdir $gbk_folder;

#system ("gzip -d *_genomic.gbff.gz");
#system ("rename 's/_genomic.gbff//' *");

my $path_fasta = "$workplace/G_SEQ";
mkdir $path_fasta;

#batch_genbank_sequence_TFT_extract ($gbk_folder, $path_fasta, $options{threads});

####################################################
####################################################
sub batch_genbank_sequence_TFT_extract {

    my ($path, $fasta, $thread_number)=@_;

    opendir DIR, $path or die $!;
    my @dir = readdir DIR; 
    @dir = grep ($_!~/^\./ ,@dir);
    closedir DIR;

    my @input;
    my @outfile;

    foreach my $file(@dir){
        my $input="$path/$file";
        push (@input,$input);  

        my $output="$fasta/$file.fasta";
        push (@outfile,$output);
    }

    my $sub_file_number = scalar @input;
    my $thread;
    my @threads;
    my $job_number=0;

    while(){ 

        last if ($job_number eq $sub_file_number);  

        while(scalar(threads->list())<$thread_number) {     #set thread number
            $job_number++;    
            my $input_file = $input[$job_number-1];
            my $output_file = $outfile[$job_number-1];
            print "Genbank_extraction: $job_number\n";
            $threads[$job_number-1]=threads->new(\&genbank_fasta_sequence_extract, $input_file, $output_file); 

            last if ($job_number eq $sub_file_number);  
        }

        foreach $thread(threads->list(threads::all)){
            if($thread->is_joinable()) {              
                $thread->join();                     
            }
        }
    }

    foreach my $th (threads->list(threads::all)) {
        $th->join();
    }

}


####################################################
####################################################
sub genbank_fasta_sequence_extract {

    my ($input, $output)=@_;
    system ("gzip -d $input");  #decompress download gbk file
    $input =~ s/.gz$//g;  #delete ".gz" from the file name 
    my $new_name = basename $input;
    $new_name =~ s/_genomic.gbff//g;
    my $new_input = dirname($input)."/$new_name";
    system ("mv $input $new_input");
    my $new_output = dirname($output)."/$new_name.fasta";
    open(OUTPUT, ">$new_output");
    my $in = new Bio::SeqIO( -file => $new_input, -format => 'genbank' );
    while (my $seqObject = $in->next_seq()){
        my $acc = $seqObject->accession;
        my $desc = $seqObject->desc;
        my $dna = $seqObject->seq();
        my $id = $seqObject->display_id();
        my $DDD=print_sequence_into_file($dna, 60);  # print_sequence_into_file($dna, 70);  # 60 characters per line

        print OUTPUT ">$id\n$DDD\n";
       
    }

    close OUTPUT;

}



#################################################
#################################################
# Print Comment Line (Header Line)


# Print sequence line by calling print_sequence subroutine


# subroutine print_sequence_into_file
# print $sequence into $FILE with $length character per line
sub print_sequence_into_file{   

    my($sequence, $length) = @_;
    my $DNA;
    my $string;
    # Print sequence in lines of $length
    for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ){   
        $DNA=substr($sequence, $pos, $length)."\n";
        $string.=$DNA;        
    }
    return $string;
}


sub update {
    my ($old, $new) = @_;

    my $update = dirname($new)."/update_list.txt";
    my $update_sh = dirname($new)."/update_list.sh";
    my $update_gbk = dirname($new)."/update_gbk";
    mkdir $update_gbk;

    open (OLD, $old);
    my %old;
    while (<OLD>){
        chomp;
        $old{$_} = $_;
    }
    close OLD;

    open (NEW, $new);
    my %new;
    open (UPDATE, ">$update");
    open (UPDATE_SH, ">$update_sh");
    while (<NEW>){
	next unless $_ !~ /^#/;
        chomp;
        $new{$_} = $_;
	my $fn = $_;
	$fn =~ s/.*\///g;
        print UPDATE "$_\n" if !defined $old{$_};
	print UPDATE_SH "wget -O $update_gbk/$fn -c https://ftp.ncbi.nlm.nih.gov/$_;\n" if !defined $old{$_};
    }
    close NEW;
    close UPDATE;

    my $remove = dirname($new)."/remove_list.txt";
    open (REMOVE, ">$remove");
    foreach (keys %old){
        print REMOVE "$_\n" if !defined $new{$_};
    }
    close REMOVE;

    return ($update, $update_sh, $update_gbk);
}

sub split_update_sh{
    my ($home_dir, $update_sh, $spilt_number, $run_number) = @_; 
    my $split_update_sh = "$workplace/split_update_sh";
    mkdir $split_update_sh;
    system ("rm -rf $split_update_sh/*");
    system ("bash $home_dir/splitDatabase.sh $update_sh $spilt_number $split_update_sh");

    opendir(SL, $split_update_sh);
    my @sp = readdir SL;
    @sp = sort grep ($_ !~ /^\./, @sp);
    closedir SL;

    use Parallel::Runner;
    my $runner = Parallel::Runner->new($run_number);
    for (my $i=0; $i<$spilt_number; $i++) {
        my $sub_sh = "$split_update_sh/$sp[$i]";
        $runner->run(
            sub{
                system ("sh $sub_sh");
            }
        )#run end

    }
    $runner->finish;

}
