#!/usr/bin/perl
#12.07.09
#modified from:
#Author: zhuerle@163.com
#script for moving low quality and 3' 5' adapter and polyA
#output is a fastq
#an extra output contains in a matrix 3' end trimmed length and 5' end trimmed end
#it possibile to avoid the quality check on qc line
#a minimal number of quality_line with qc  greater than threshold can be defined by user

use strict;
use Getopt::Std;
use vars qw($opt_i $opt_o $opt_q $opt_n $opt_x $opt_y $opt_f $opt_h);
getopts('i:o:q:n:x:y:l:f:h');
my $fq_file        = $opt_i;
my $fqout_file     = $opt_o;
my $qc_thr         = $opt_q;
my $sp_name				 = $opt_n ? $opt_n : "sample";
my $adapter5       = $opt_x ? $opt_x : "GTTCAGAGTTCTACAGTCCGACGATC";
my $adapter3       = $opt_y ? $opt_y : "TCGTATGCCGTCTTCTGCTTG";
my $format         = $opt_f ? $opt_f : 2;
my $help           = $opt_h ? 1 : 0;



my $usage = << "USAGE";

Description: Perl script used to filter low quality short reads, remove polyA and trim 3' 5' adapter
Author: zhuerle\@163.com
Usage: perl Adapter_trim.pl [options] >outputfile
Options:
  -i <file>  Short reads file in fastq format 
  -o <file>  Short reads file in fastq OUT format 
  -q <str>   Minimal number of quality scores greater than the minimal threshold format; default=5
  -n <str>   Sample name; default="sample"
  -x <str>   5\' adaptor sequence, default="GTTCAGAGTTCTACAGTCCGACGATC"
  -y <str>   3\' adaptor sequence, default="TCGTATGCCGTCTTCTGCTTG"
  -f <int>   Fastq file format: 1=Sanger format; 2=Solexa/Illumina 1.0 format; 3=Illumina 1.3+ format; 4=No_quality_check format; default=4
  -h         Help
Examples: perl Adapter_trim.pl -i sample.fq -n "newid" -f 1 >outputfile
          perl Adapter_trim.pl -i sample.fq -x "ATCGGGCT" -y "TCGTAT" -f 3 >outputfile
USAGE

if ($help) 
{
	print $usage;
	exit;
}

unless (( -e $fq_file ) and ( $adapter5 =~ /[A|T|C|G]/i ) and ( $adapter3 =~ /[A|T|C|G]/i ) and ( $format =~ /^[1|2|3|4]$/ )) 
{
	print $usage;
	exit;
}
## filter low quality
open IN,"<$fq_file" or die $!;    #input file.fastq
open OUT,">$fqout_file" or die $!;   #output file.fastq
open OUT1,">$fq_file.triml" or die $!; #output stat trimming 3' end fist line 5'end second line
while (my $read_name = <IN>)
{
  my $read = <IN>;
	chomp $read;
	$read =~ tr/atcg/ATCG/;
	$adapter3 =~ tr/atcg/ATCG/;
	$adapter5 =~ tr/atcg/ATCG/;
	#quality lines
  my $qc_name = <IN>;
  my $qc = <IN>;
  chomp $qc;
	
    ###move 3' adapter
    my $read_af3 = &mvadapter($read,$adapter3,0);
    $qc = substr($qc, 0, length($read_af3));

    my $start3 = length($read) -  length($read_af3);
#    print OUT1 "3end\t";
    print OUT1 $start3;
    print OUT1 "\t";

    my $read_af5 = &mvadapter($read_af3,$adapter5,1);
    my $start5 = length($read_af3) -  length($read_af5);

#    print OUT1 "5end\t";
    print OUT1 $start5;
    print OUT1 "\n";

    $qc = substr($qc, $start5, length($read_af5));
    if (length($read_af5) > 16)
    {
        my $quality=0;
	      if ($format == 2)
	      {
			    $quality= &check_qlt($qc,64,9,$qc_thr);
	      }
	      elsif ($format == 1)
	     {
			    $quality= &check_qlt($qc,33,15,$qc_thr);
	     }
	     elsif ($format == 3)
	     {
			    $quality= &check_qlt($qc,64,10,$qc_thr);
      }
		      elsif ($format == 4)
	    {
			    $quality = 1;
	    }
      if ($quality == 1)
      {  
        print OUT $read_name;
        #remove the last three nts in case are still part of the linker
        chop $read_af5;
        chop $read_af5;
        chop $read_af5; 
        print OUT $read_af5;
        print OUT "\n";
        print OUT $qc_name;
        chop $qc;
        chop $qc;
        chop $qc;
        print OUT $qc;
        print OUT "\n";
      }  
    }
}
close IN;
close OUT;
close OUT1;


#########################################################################################################
#########################################################################################################
#########################################################################################################
#########################################################################################################
sub check_qlt
{
	my $quality_line = shift;
	my $asc = shift;
	my $tv = shift;
	my $qc_thr= shift;
	my $num = 0;
	my $count = 0;
	my @ql = split (//,$quality_line);
	my $wid = $#ql+1;
	foreach my $i (0..$#ql)
	{
		$num = ord($ql[$i])-$asc;
		if($num-$tv <0){$count++;}
	}

		if( $count > $qc_thr ){return 0;}       #discarding reads
		else {return 1;}
	
}


sub mvadapter
{
	my $read = shift;
	my $adapter = shift;
	my $mode = shift;
	my $readback;
	if ($mode == 1)
	{
		$read = reverse($read);
		$adapter = reverse($adapter);	
	}
	my $bl= length($read);
	my $tl= length($adapter);
	my @bemapped=split(//,$read);
	my @tomap=split(//,$adapter);
	my @record;
	for (my $i =0; $i<$bl;$i++)
	{
		my $match =0;
		my $mismatch = 0;
		for (my $n=0;$n<$tl;$n++)
		{
			last unless( $bemapped[$i+$n]);
			if($bemapped[$i+$n] eq $tomap[$n]){$match++;}
			else 
			{
				$mismatch++;
				last if ($mismatch >3);
			}		
		}
		my $long= $match+$mismatch;
		my $per = sprintf "%.2f",$mismatch/($match+$mismatch);
		if($mismatch < 4 and $per < 0.3 and $long >4)
		{
			push @record ,[$per,$i,$mismatch,$match];
		}	
	}
	
	if ($#record == 0)
	{ 
		$readback = substr($read,0,$record[0][1]);
		if ($mode ==1){$readback = reverse $readback;}
	}
	elsif($#record > 0)
	{
		my @record_sort = sort {$a->[0] <=> $b->[0]} @record;
		$readback = substr($read,0,$record_sort[0][1]);
		if ($mode ==1){$readback = reverse $readback;}
	}
	else {$readback = $mode ? reverse($read) : "null";}
	return $readback;
}


sub mvpolyA 
{
	my $read = shift;
	$read =~ /(A{3,})/i;
	my $rp=$1;	
	if ((length $rp) > 3) 
	{
		return 1;
	}
	else 
	{
		return 0;
	}
}