#!/usr/bin/perl -w

use strict;
use warnings;
use FindBin;
use File::Basename qw<basename dirname>;

# Usage example:
#   perl /home/xiangyang/moxiu/TypeStrain_ANI/assembly_ftpsite_gbk.pl /home/xiangyang/moxiu/TypeStrain_ANI/assembly_summary_genbank-20230610.txt

# Build tab-separated download tables from an assembly summary table.
#
# Input  : $ARGV[0], a tab-separated summary file. Lines starting with '#'
#          are treated as header/comment lines and ignored.
#          NOTE(review): the column positions used below (8 = organism name,
#          9 = infraspecific name, 10 = isolate, 20 = FTP/URL path,
#          22 = relation to type material; 1-based) presumably follow the NCBI
#          assembly_summary layout -- confirm if another table is supplied.
# Output : two files written into the directory that contains this script:
#            assembly_FTPsite.txt            - one line per usable row, with a
#                                              *_genomic.gbff.gz link
#            Typestrain_assembly_FTPsite.txt - only rows whose column 22 is
#                                              exactly "assembly from type
#                                              material", with a
#                                              *_genomic.fna.gz link
#          Each output line is:
#            accession <TAB> assembly directory name <TAB> display name <TAB> link
# Dies   : when no input file is given, or when any file cannot be opened or
#          an output file cannot be closed cleanly.

my $sum = $ARGV[0];   # path to the tab-separated assembly summary table
die "Usage: $0 <assembly_summary.txt>\n" unless defined $sum && length $sum;

my $home_directory = $FindBin::Bin;   # directory containing this script; outputs are written here
my $out_total1 = "$home_directory/assembly_FTPsite.txt";
my $out_total2 = "$home_directory/Typestrain_assembly_FTPsite.txt";

# Open the input first so that a bad input path cannot truncate existing outputs.
open(my $sum_fh, '<', $sum)
    or die "Cannot read '$sum': $!\n";
open(my $out_all_fh, '>', $out_total1)
    or die "Cannot write '$out_total1': $!\n";
open(my $out_type_fh, '>', $out_total2)
    or die "Cannot write '$out_total2': $!\n";

my $skipped = 0;   # data rows from which no accession/link could be derived

# Stream the table line by line instead of loading the whole file into memory.
while (my $line = <$sum_fh>) {
    chomp $line;
    next if $line =~ /^#/;

    # Limit -1 keeps trailing empty columns so field positions stay stable.
    my @arr = split /\t/, $line, -1;

    my $organism_name = _field(\@arr, 7);

    # Keep only the value part of "key=value" style entries (e.g. "strain=XYZ").
    my $infraspecific_name = _blank_if_missing(_field(\@arr, 8));
    $infraspecific_name =~ s/.*?\=//g if $infraspecific_name ne "blank";

    my $isolate = _blank_if_missing(_field(\@arr, 9));

    # Display name: organism name plus a qualifier that is not already part of
    # it. The isolate is evaluated last, so it wins when both qualifiers apply.
    # index() is used rather than a regex because an empty interpolated
    # pattern would make Perl reuse the last successfully matched regex.
    my $fullname = $organism_name;
    for my $qualifier ($infraspecific_name, $isolate) {
        next if $qualifier eq "blank";
        next if index($organism_name, $qualifier) >= 0;
        $fullname = "$organism_name $qualifier";
    }

    # The last path component is the assembly directory name, e.g.
    # "GCA_000000000.1_SomeName"; the accession is the part before the
    # second underscore.
    my $ftp_path = _field(\@arr, 19);
    (my $assembly_no = $ftp_path) =~ s/.*\///g;
    my ($accession) = $assembly_no =~ /(GC[AF]_.*?)_.*/;

    # Rows without a parsable path (blank lines, "na", truncated rows) would
    # only yield a broken link, so they are skipped and counted.
    if (!defined $accession) {
        $skipped++;
        next;
    }

    print {$out_all_fh} "$accession\t$assembly_no\t$fullname\t$ftp_path/$assembly_no", "_genomic.gbff.gz\n";
    print {$out_type_fh} "$accession\t$assembly_no\t$fullname\t$ftp_path/$assembly_no", "_genomic.fna.gz\n"
        if _field(\@arr, 21) eq "assembly from type material";
}

close($sum_fh);
# Buffered write errors only surface at close, so check the output handles.
close($out_all_fh)  or die "Error while closing '$out_total1': $!\n";
close($out_type_fh) or die "Error while closing '$out_total2': $!\n";

warn "Skipped $skipped row(s) of '$sum' without a usable assembly path\n" if $skipped;

# Return element $index of the array referenced by $fields, or "" when the
# row is too short to contain it.
sub _field {
    my ($fields, $index) = @_;
    return defined $fields->[$index] ? $fields->[$index] : "";
}

# Map an empty or "na" value to the placeholder "blank"; return any other
# value unchanged.
sub _blank_if_missing {
    my ($value) = @_;
    return ($value eq "" || $value eq "na") ? "blank" : $value;
}




