#perl /home/xiangyang/Bacth_SNP-20200102/Core-SNP_tool/alignment_snpExtract.pl /home/xiangyang/Bacth_SNP-20200102/Core-SNP_tool/T 2

use strict;
use warnings;
use Bio::SeqIO;
use File::Basename qw<basename dirname>;
use FindBin;
use lib "$FindBin::Bin/lib";
use File::Spec;
use threads;

my $out_gene_folder = File::Spec->rel2abs($ARGV[0]);
my $thread_number = $ARGV[1];

    print "\nStep-1: Alignment each of the core gene set";
    my $alignment_out_folder = alignment_muscle($out_gene_folder, $thread_number);
    print "\nStep-2: Concatenate snp\n";
    my $SNP_dir = Extract_snp_alignment_dir($alignment_out_folder);




sub go_up_folder {

    my $current_path = shift;
    $current_path =~ s/(.*)\///; 
    my $up_folder = $1;
    return $up_folder;
}


sub alignment_muscle {

    my ($out_gene_folder, $thread_number) = @_;

    my @input; 
    my @outfile;
    opendir OUT_GENE_FOLDER, $out_gene_folder or die "could not open $out_gene_folder";
    my @OUT_GENE_FOLDER = readdir OUT_GENE_FOLDER; 
    @OUT_GENE_FOLDER =grep ($_!~/^\./ ,@OUT_GENE_FOLDER);  #delete hidden file . ..
    closedir OUT_GENE_FOLDER;

    my $alignment_out_folder = go_up_folder($out_gene_folder)."/alignment_out_folder";
    mkdir "$alignment_out_folder";

    foreach my $file_protein(@OUT_GENE_FOLDER){
 
        my $input="$out_gene_folder/$file_protein"; 
        push (@input,$input);  
        @input = sort @input;
        my $output="$alignment_out_folder/$file_protein.alignmentout";
        push (@outfile,$output);
        @outfile = sort @outfile;

    }

    my $work_number = scalar @input;
    $thread_number = $work_number if $thread_number >= $work_number;

    my $thread;
    my @threads;
    my $job_number=0;
    print "\nMuscle_perform_percent ($work_number genes need to do): ";
    while(){ 
        last if ($job_number>=$work_number);                         
        while(scalar(threads->list())<$thread_number) {     #set threadnumber；
            $job_number++;                                 
            my $input_file = $input[$job_number-1];
            my $output_file = $outfile[$job_number-1];

            my $Muscle_perform_progress = int (($job_number/$work_number)*100);
            print "$Muscle_perform_progress","...";
            $threads[$job_number-1]=threads->new(\&do_muscle, $input_file, $output_file);  
            last if ($job_number>=$work_number);                          
            }

        foreach $thread(threads->list(threads::all)){
            if($thread->is_joinable()) {              
                $thread->join();                     
            }
        }

    } 
 
    foreach my $th (threads->list(threads::all)) {
        $th->join();
    }
    print "done\n";

    return $alignment_out_folder;
}



sub do_muscle {

   my ($input, $outfile) =@_;

   system ("muscle -in $input -out $outfile -quiet");

}




sub Extract_snp_alignment_dir {

    my $path = shift;
    opendir DIR, $path or die $!;
    my @dir = readdir DIR; 
    @dir =grep ($_!~/^\./ ,@dir);  #delete hidden file . .. 
    closedir DIR;
    my $snp_dir = go_up_folder($path)."/snp_dir";
    mkdir $snp_dir;

foreach my $each_file (@dir) {
    my $infile       = "$path/$each_file";
    my %seq_hash     = parse_fasta($infile, "F"); # key: seqid, value:sequence

    my $output="$snp_dir/$each_file.fasta";
    open(OUTPUT, ">$output");
    my %ID_number;
    my @snp_site_index;
    my $each_file_max_length;

    foreach my $key(sort keys %seq_hash) {

        my @array = split (//, $seq_hash{$key});
        $each_file_max_length = scalar @array;
        for (my $i=0; $i<=$#array; $i++) {

            $ID_number{$key."_".$i} = $array[$i];

        }

    }

    for (my $k=0; $k<$each_file_max_length; $k++) {
        my @check_repreat;
        foreach my $key(sort keys %seq_hash) {
            push (@check_repreat, $ID_number{$key."_".$k});
        }
    
 
        my %saw;
        @saw{@check_repreat} = ( );
        my @uniq_array = sort keys %saw;
        if (grep {$_ eq "-"} @uniq_array) {

            if (scalar @uniq_array ge 3) {

                push (@snp_site_index, $k+1);
            }
        }else {
        
            if (scalar @uniq_array ge 2) {

                push (@snp_site_index, $k+1);
            }
        }


    }

 
    @snp_site_index = sort{$a<=> $b}@snp_site_index;
    if (scalar @snp_site_index >=1) {
        foreach my $snp_key(sort keys %seq_hash) {
 
            my $print_seq;
            foreach (@snp_site_index){
                my $j=$_-1;
                #print  $ID_number{$snp_key."_".$j};
                $print_seq .= $ID_number{$snp_key."_".$j};
            }
            print OUTPUT ">$snp_key\n$print_seq\n";
        }
    } 
    close OUTPUT;

}

    return $snp_dir;
}



sub parse_fasta {

    my ($infile, $unify_id) = @_;
    my $term     = $/; # input record separator;
    my %seq_hash = (); # key:seqid, val:seq
    open my $INFILE, $infile or die "could not open infile '$infile' : $! \n"; 
    $/ = ">"; # input record separator;
    while(<$INFILE>) {

        chomp;

        next if($_ eq '');
        my ($id, @sequencelines) = split /\n/;
        if ($unify_id eq "T"){
            my @ww = split ";", $id;                                 #使每个文件的ID变为一致
            $id = $ww[2];
        }

        foreach my $line (@sequencelines) {
            $seq_hash{$id} .= $line;
        }
    }
    $/ = $term;
    return(%seq_hash);

} # end of parse_fasta

