# perl /home/xiangyang/Desktop/assembly/assemble_p.pl /home/xiangyang/Desktop/assembly/test 3 metaspades
# perl /media/xiangyang/test/assemble_p.pl /media/xiangyang/test/P1214_trim 16 metaspades

 
use strict;
use warnings;
use FindBin;
use File::Basename qw<basename dirname>;

my $home_directory = $FindBin::Bin;           # obtaining the home directory where type_strain_extract.pl
my $base = basename $ARGV[0]; 
my $workplace = "$home_directory/workplace_$base";
mkdir $workplace;

my $final_contig = "$workplace/final_contig";
mkdir $final_contig;

my $bin_dir = "$workplace/bin_dir";
mkdir $bin_dir;

my $final_bin = "$workplace/final_bin";
mkdir $final_bin;

chdir $workplace;

opendir (SRA, $ARGV[0]);
my @sra = readdir SRA;
@sra = grep($_ !~ /^\./, @sra);
@sra = grep($_ !~ /[rR]2.fastq.gz$/, @sra);
close SRA;


foreach my $f (@sra) { #PHESPD0013.R2.fastq.gz  P663_10_1.fq.gz
    next unless $f =~ /(.*)([._])(([rR])?)1(.fastq|.fq)(.gz)?$/;
    $b = $1;
    my $ff1 = $1.$2.$3."1".$5.$6;
    my $ff2 = $1.$2.$3."2".$5.$6;

    my $f1 = $1.$2.$3."1".$5;
    my $f2 = $1.$2.$3."2".$5;

    my $p1 = "$b"."_1".".fastp";
    my $p2 = "$b"."_2".".fastp";

    system ("gzip -dk $ARGV[0]/$ff1");
    system ("gzip -dk $ARGV[0]/$ff2");
    system("mv $ARGV[0]/$f1 -t $workplace");
    system("mv $ARGV[0]/$f2 -t $workplace");

    my $threads = $ARGV[1];

###filter metedata using fastp
    system("$home_directory/fastp -i $f1 -I $f2 -o $p1.fastq -O $p2.fastq -w 16 -j $workplace/$b.jason -h $workplace/$b.html >> fastp.log"); 

###assemble genome using megahit
    system("megahit  --presets meta-large -t 30 -1 $p1.fastq -2 $p2.fastq -o ./megahit_$b --out-prefix $b --min-contig-len 200") if $ARGV[2] eq "megahit"; 
    system("cp megahit_$b/$b.contigs.fa -t $final_contig") if $ARGV[2] eq "megahit";

###assemble genome using metaspades    
    system("metaspades.py -1 $p1.fastq -2 $p2.fastq -o metaspades_$b -t $threads") if $ARGV[2] eq "metaspades";
    system("cp metaspades_$b/contigs.fasta $final_contig/$b.contigs.fa") if $ARGV[2] eq "metaspades";

###bin genome using metabat2   
    &bin($bin_dir, $final_contig, $b, "$p1.fastq", "$p2.fastq"); #current workplace is $workplace
    system("cp $bin_dir/$b.bin* -t $final_bin");
}


### summerize each assembled genomes and modified the fasta-id name by delete the content after blank
&trans($final_contig);
&trans($final_bin);




sub bin { #current workplace is $workplace

    my ($bin_dir, $fcontig, $fname, $fastq1, $fastq2) = @_;

        #obtain contigs_fa
        system("bowtie2-build --threads 30 $fcontig/$fname.contigs.fa $bin_dir/$fname.contigs_fa");

        #obtain contigs.sam
        system("bowtie2 -p 40 --local --very-sensitive-local -x $bin_dir/$fname.contigs_fa -1 $fastq1 -2 $fastq2 -S $bin_dir/$fname.contigs.sam");

        #obtain contigs.sort.bam
        system("samtools sort --threads 46 $bin_dir/$fname.contigs.sam -o $bin_dir/$fname.contigs.sort.bam");

        #obtain depth.txt
        system("jgi_summarize_bam_contig_depths --outputDepth $bin_dir/$fname.depth.txt $bin_dir/$fname.contigs.sort.bam");

        #obtain bin
        system("metabat2 -i $fcontig/$fname.contigs.fa -a $bin_dir/$fname.depth.txt -o $bin_dir/$fname.bin -t 46 -m 1500"); 
    

}


sub trans {
    my $f = shift;
    opendir(F, $f);
    my @f = readdir F;
    @f = grep($_ !~ /^\./, @f);

    close F;

    my $r = "$f/reports.txt";
    open (R, ">$r");
    print R "Genome\tSize\tContig_number\tMin_contig\tMax_contig\tN50\n";

    foreach my $fe(@f){
        my %h = parse_fasta("$f/$fe");
        my $of = "$f/$fe";
        open(O, ">$of");
        my @fe;
        my $len;
        foreach (keys %h){
            my $id = $_;
            $id =~ s/ .*//g;
            my $D = print_sequence_into_file($h{$_}, 60);  # print_sequence_into_file($dna, 70);
            print O ">$id\n$D\n" if length($h{$_}) >= 200;
            push @fe, length($h{$_});
            $len +=length($h{$_});
        }
        @fe = sort{$a<=>$b}@fe;
        my $N50;
        my $a;
        foreach (@fe){
            $a += $_;  
            $N50 = $_ if $a >= int($len/2); 
            last if $a >= int($len/2);
        }
        my $contig_num = scalar @fe;
   
        print R "$fe\t$len\t$contig_num\t$fe[0]\t$fe[-1]\t$N50\n";

        close O;
    }
    close R;
}



sub parse_fasta {

    my ($infile) = @_;
    my $term     = $/; # input record separator;
    my %seq_hash = (); # key:seqid, val:seq
    open my $INFILE, $infile or die "could not open infile '$infile' : $! \n"; 
    $/ = ">"; # input record separator;
    while(<$INFILE>) {
        chomp;
        next if($_ eq '');
        my ($id, @sequencelines) = split /\n/;
        foreach my $line (@sequencelines) {
            $seq_hash{$id} .= $line;
        }
    }
    $/ = $term;
    return(%seq_hash);

} # end of parse_fasta



# print $sequence into $FILE with $length character per line
sub print_sequence_into_file{   

    my($sequence, $length) = @_;
    my $DNA;
    my $string;
    # Print sequence in lines of $length
    for (my $pos = 0; $pos < length($sequence); $pos += $length){   
        $DNA=substr($sequence, $pos, $length)."\n";
        $string.=$DNA;        
    }

    return $string;

}
