# perl /home/xiangyang/Desktop/assembly/assemble_p_bacth.pl /home/xiangyang/Desktop/assembly/test 3 metaspades
# perl /media/xiangyang/test/assemble_p_bacth.pl /media/xiangyang/test/P1214_trim 16 metaspades
# perl /home/xiangyang/moxiu/bin/AssemblyBin/AssemblePipeline/assemble_p_bacth.pl /home/xiangyang/cao/zhangyan 17 metaspades
 
use strict;
use warnings;
use FindBin;
use File::Basename qw<basename dirname>;

my $home_directory = $FindBin::Bin;           # obtaining the home directory where type_strain_extract.pl
my $base = basename $ARGV[0]; 
my $workplace = "$home_directory/workplace_$base";
mkdir $workplace;

my $sh_dir = "$workplace/sh_dir";
mkdir $sh_dir;

my $final_contig = "$workplace/final_contig";
mkdir $final_contig;

my $bin_dir = "$workplace/bin_dir";
mkdir $bin_dir;

my $final_bin = "$workplace/final_bin";
mkdir $final_bin;

chdir $workplace;

opendir (SRA, $ARGV[0]);
my @sra = readdir SRA;
@sra = grep($_ !~ /^\./, @sra);
@sra = grep($_ !~ /[rR]2.fastq.gz$/, @sra);
close SRA;

use Parallel::Runner;
my $runner = Parallel::Runner->new($ARGV[1]);


foreach my $f (@sra) {
    next unless $f =~ /(.*)([._])(([rR])?)1(.fastq|.fq)(.gz)?$/; #PHESPD0013.R2.fastq.gz
    $b = $1;
    my $ff1 = $1.$2.$3."1".$5.$6;
    my $ff2 = $1.$2.$3."2".$5.$6;

    my $f1 = $1.$2.$3."1".$5;
    my $f2 = $1.$2.$3."2".$5;

    my $p1 = "$b"."_1".".fastp";
    my $p2 = "$b"."_2".".fastp";

    open (SH, ">$sh_dir/$b.sh");
    #my $threads = $ARGV[1]*5;
    $runner->run(
        sub{
            my $cmd1 = "gzip -dk $ARGV[0]/$ff1";
            my $cmd2 = "gzip -dk $ARGV[0]/$ff2";
            my $cmd3 = "mv $ARGV[0]/$f1 -t $workplace";
            my $cmd4 = "mv $ARGV[0]/$f2 -t $workplace";
            print SH "$cmd1\n$cmd2\n$cmd3\n$cmd4\n";
            
            my $cmd5 = "$home_directory/fastp -i $f1 -I $f2 -o $p1.fastq -O $p2.fastq -w 8 -j $workplace/$b.jason -h $workplace/$b.html >> fastp.log";
            print SH "###filter metedata using fastp\n";
            print SH "$cmd5\n";
            
            my ($cmd6, $cmd7);
            $cmd6 = "megahit  --presets meta-large -t 30 -1 $p1.fastq -2 $p2.fastq -o ./megahit_$b --out-prefix $b --min-contig-len 200" if $ARGV[2] eq "megahit";
            $cmd7 = "cp megahit_$b/$b.contigs.fa -t $final_contig" if $ARGV[2] eq "megahit";
            $cmd6 = "metaspades.py -1 $p1.fastq -2 $p2.fastq -o metaspades_$b -t 16" if $ARGV[2] eq "metaspades"; 
            $cmd7 = "cp metaspades_$b/contigs.fasta $final_contig/$b.contigs.fa" if $ARGV[2] eq "metaspades";
            print SH "###assemble genome using megahit\n" if $ARGV[2] eq "megahit";
            print SH "###assemble genome using metaspades\n" if $ARGV[2] eq "metaspades";
            print SH "$cmd6\n$cmd7\n";
            
            #system("$cmd1");
            #system("$cmd2");
            #system("$cmd3");
            #system("$cmd4");

            ###filter metedata using fastp
            #system("$cmd5"); 

            ###assemble genome using megahit
            #system("$cmd6") if $ARGV[2] eq "megahit"; 
            #system("$cmd7") if $ARGV[2] eq "megahit";

            ###assemble genome using metaspades   
	    #system("$cmd6") if $ARGV[2] eq "metaspades";
	    #system("$cmd7") if $ARGV[2] eq "metaspades";

            ###bin genome using metabat2   
	    my $str_cmd = &bin($bin_dir, $final_contig, $b, "$p1.fastq", "$p2.fastq"); #current workplace is $workplace
	    print SH "###bin genome using metabat2\n$str_cmd";
	    
	    my $cmd13 = "cp $bin_dir/$b.bin* -t $final_bin";
	    #system("$cmd13");
	    
	    ###delete files
	    my $cmd14 = "rm -rf $f1";
	    my $cmd15 = "rm -rf $f2";
	    print SH "$cmd13\n###delete files\n$cmd14\n$cmd15\n";
	    close SH;
	    #system("$cmd14");
	    #system("$cmd15");
	    system("bash $sh_dir/$b.sh"); 
        }
    )
}

$runner->finish;
### summerize each assembled genomes and modified the fasta-id name by delete the content after blank
&trans($final_contig);
&trans($final_bin);




sub bin { #current workplace is $workplace

    my ($bin_dir, $fcontig, $fname, $fastq1, $fastq2) = @_;
    
        my $cmd8 = "bowtie2-build --threads 16 $fcontig/$fname.contigs.fa $bin_dir/$fname.contigs_fa"; 
        my $cmd9 = "bowtie2 -p 16 --local --very-sensitive-local -x $bin_dir/$fname.contigs_fa -1 $fastq1 -2 $fastq2 -S $bin_dir/$fname.contigs.sam";
        my $cmd10 = "samtools sort --threads 16 $bin_dir/$fname.contigs.sam -o $bin_dir/$fname.contigs.sort.bam";
        my $cmd11 = "jgi_summarize_bam_contig_depths --outputDepth $bin_dir/$fname.depth.txt $bin_dir/$fname.contigs.sort.bam";
        my $cmd12 = "metabat2 -i $fcontig/$fname.contigs.fa -a $bin_dir/$fname.depth.txt -o $bin_dir/$fname.bin -t 16 -m 1500";
        my $str_cmd = "#obtain contigs_fa\n$cmd8\n#obtain contigs.sam\n$cmd9\n#obtain contigs.sort.bam\n$cmd10\n#obtain depth.txt\n$cmd11\n#obtain bin\n$cmd12\n";
        return $str_cmd;
        #obtain contigs_fa
        #system("$cmd8");

        #obtain contigs.sam
        #system("$cmd9");

        #obtain contigs.sort.bam
        #system("$cmd10");

        #obtain depth.txt
        #system("$cmd11");

        #obtain bin
        #system("$cmd12"); 
    

}


sub trans {
    my $f = shift;
    opendir(F, $f);
    my @f = readdir F;
    @f = grep($_ !~ /^\./, @f);

    close F;

    my $r = "$f/reports.txt";
    open (R, ">$r");
    print R "Genome\tSize\tContig_number\tMin_contig\tMax_contig\tN50\n";

    foreach my $fe(@f){
        my %h = parse_fasta("$f/$fe");
        my $of = "$f/$fe";
        open(O, ">$of");
        my @fe;
        my $len;
        foreach (keys %h){
            my $id = $_;
            $id =~ s/ .*//g;
            my $D = print_sequence_into_file($h{$_}, 60);  # print_sequence_into_file($dna, 70);
            print O ">$id\n$D\n" if length($h{$_}) >= 200;
            push @fe, length($h{$_});
            $len +=length($h{$_});
        }
        @fe = sort{$a<=>$b}@fe;
        my $N50;
        my $a;
        foreach (@fe){
            $a += $_;  
            $N50 = $_ if $a >= int($len/2); 
            last if $a >= int($len/2);
        }
        my $contig_num = scalar @fe;
   
        print R "$fe\t$len\t$contig_num\t$fe[0]\t$fe[-1]\t$N50\n";

        close O;
    }
    close R;
}



sub parse_fasta {

    my ($infile) = @_;
    my $term     = $/; # input record separator;
    my %seq_hash = (); # key:seqid, val:seq
    open my $INFILE, $infile or die "could not open infile '$infile' : $! \n"; 
    $/ = ">"; # input record separator;
    while(<$INFILE>) {
        chomp;
        next if($_ eq '');
        my ($id, @sequencelines) = split /\n/;
        foreach my $line (@sequencelines) {
            $seq_hash{$id} .= $line;
        }
    }
    $/ = $term;
    return(%seq_hash);

} # end of parse_fasta



# print $sequence into $FILE with $length character per line
sub print_sequence_into_file{   

    my($sequence, $length) = @_;
    my $DNA;
    my $string;
    # Print sequence in lines of $length
    for (my $pos = 0; $pos < length($sequence); $pos += $length){   
        $DNA=substr($sequence, $pos, $length)."\n";
        $string.=$DNA;        
    }

    return $string;

}
