#!/bin/bash

# usage function
usage() { 
	echo
	printf "Usage: %s [OPTIONS] <ARGS>\n
	[OPTIONS] and corresponding <ARGS> are:\n
	Default is generic eukaryotic annotation.
	[-F] Specify to activate fungal annotation
	[-B] Specify to activate bacterial annotation
	Necessary:
	[-g] <assembled genome fasta file>
	[-p] <protein homology fasta file, separated by ',' if more than 1>
	optional, but very helpful:
	[-e] <organism-specific EST fasta or transcriptome assembly fasta>
	and/or:
	[-a] <organism-specific mapped transcript gff3>
	not necessary:
	[-r] <repeat library fasta file if it exists, else it'll be built>
	[-i] <gff containing intron intervals determined from read mapping>
	[-n] <number of CPUs> --Default is MAX
	[-h] Display this help message
	\n", "$0" 1>&2; exit 1; 
}

# file check function
fc () {
	[ -f "$1" ] && return 0 || echo ERROR: could not find file "$1"; exit 2
}

# Make sure there are arguments supplied to the script
if [ $# -lt 1 ] ; then
	echo No options were supplied to the script. See usage.
	usage
fi

# snap training routine
snap_train () {
	round=$1
	rm -rf snap"$round" 2>/dev/null
	mkdir snap"$round"
	cp "$MAKERGFF" snap"$round"/
	cd snap"$round"
	maker2zff -n "$MAKERGFF"
	fathom -categorize 1000 genome.ann genome.dna
	fathom -export 1000 -plus uni.ann uni.dna
	forge export.ann export.dna
	hmm-assembler.pl "$name" . > ../"$name.snap.round_$round.hmm"
	cd ..
}

# augustus training routine
augustus_train () {
	round=$1
	cd snap"$round"
	#rm -rf "$AUGUSTUS_CONFIG_PATH/species/$name"*
	zff2augustus_gbk.pl > augustus.model_"$round".gb
	eval randomSplit.pl augustus.model_"$round".gb "$(echo "$(grep -c '^LOCUS' augustus.model_"$round".gb)/10" | bc)"
	eval new_species.pl --species="$name.augustus.round_$round"
	etraining --species="$name.augustus.round_$round" augustus.model_"$round".gb.train
	augustus --species="$name.augustus.round_$round" augustus.model_"$round".gb.test > augustus.round_"$round".check1
	nice -19 optimize_augustus.pl --species="$name.augustus.round_$round" --onlytrain=augustus.model_"$round".gb.train --cpus="$NUMPROC" augustus.model_"$round".gb.test
	etraining --species="$name.augustus.round_$round" augustus.model_"$round".gb.train
	augustus --species="$name.augustus.round_$round" augustus.model_"$round".gb.test > augustus.round_"$round".check2
	cd ..
	mkdir augustus"$round"
	mv snap"$round"/augustus* augustus"$round"/
}

# GM training routine
genemark_train () {
	round=$1
	if [ "$FUNGUS" = "true" ]; then
		if [ "$INTRONGFF" ]; then
			nice -19 gmes_petap.pl --ET "$INTRONGFF" --fungus --cores "$NUMPROC"  --sequence "$GENOME" --et_score 5
		else
			nice -19 gmes_petap.pl --ES --fungus --cores "$NUMPROC" --sequence "$GENOME"
		fi
		cp output/gmhmm.mod "$name.GM.mod"
		mkdir GM
		mv info output run.cfg data run genemark.gtf gmes.log GM/
	else
		if [ "$INTRONGFF" ]; then
			nice -19 gmes_petap.pl --ET "$INTRONGFF" --cores "$NUMPROC"  --sequence "$GENOME" --et_score 5
		else
			nice -19 gmes_petap.pl --ES --cores "$NUMPROC" --sequence "$GENOME"
		fi
		cp output/gmhmm.mod "$name.GM.mod"
		mkdir GM
		mv info output run.cfg data run genemark.gtf gmes.log GM/
	fi
}

####################################################################
# Variables + commandline parsing
####################################################################

# Run the MAKER control file generator
maker -CTL
CTLFILE=maker_opts.ctl

# Command-line parsing
while getopts 'g:p:a:e:r:n:hFBl:i:' OPTION
do
	case $OPTION in
		g)	fc "$OPTARG"
			GENOME="$OPTARG"
			name=$(echo "$GENOME" | sed -r 's/.(fa|fasta|fas|fna)$//')
			sed -i "/^genome=/s/=/=$OPTARG/1" $CTLFILE;;
		p)	sed -i "/^protein=/s/=/=$OPTARG/1" $CTLFILE;;
		a)	fc "$OPTARG"
			ESTGFF="$OPTARG"
			sed -i "/^est_gff=/ s/=/=$ESTGFF/1" $CTLFILE;;
		e)	fc "$OPTARG"
			ESTFASTA="$OPTARG"
			sed -i "/^est=/ s/=/&$ESTFASTA/1" $CTLFILE;;
		r)	REP="$OPTARG"
			fc "$OPTARG"
			sed -i "/^rmlib/s/=/=$REP/1" $CTLFILE;;
		i)	INTRONGFF="$OPTARG"
			fc "$OPTARG";;
		n)	NUMPROC=$OPTARG;;
		F)	FUNGUS="true";;
#		l)	LINEAGE="$OPTARG";;
		h)	usage;;
		?)	usage;;
	esac
done
shift $((OPTIND - 1))
if [ "$NUMPROC" = "" ]; then
	NUMPROC=$(lscpu | awk '/^CPU\(s\):/ {print $2}') # default number of CPUs in Maker and Interproscan
fi

# Maker data index file (for fetching gff and fasta files)
DATASTORE="$(pwd)/$name.maker.output/$name"_master_datastore_index.log

# GFF files generated at each maker run
MAKERGFF="$(pwd)/$name.all.gff3"

# Final fasta file created by extracting info from GFF
PROTS="$name.all.maker.proteins.fasta"
XCRIPTS="$name.all.maker.transcripts.fasta"

####################################################################
# Initial conditions to run Maker with (maker_opts.ctl file)
####################################################################

# Allow mapping of RNA + protein data to the genome for Exonerate polishing
if [ "$ESTGFF" -o "$ESTFASTA" ] # if transcript data exists
then # align it to the genome; else don't because it can't give useful info otherwise
	sed -i "/^est2genome/s/0/1/1;" $CTLFILE
fi
sed -i "/^protein2genome/s/0/1/1;
		/^single_exon/s/0/1/1;
		/^model_org/s/all//1;
		/^min_contig/s/1/4000/1" $CTLFILE

sed -i "/^keep_preds/s/0/1/1" $CTLFILE

mkdir tmp
sed -i "/^TMP=/s/TMP=/TMP=$(pwd | sed 's/\//\\\//g')\/tmp/" $CTLFILE

####################################################################
# Maker round 1
####################################################################

cp $CTLFILE $CTLFILE.round_1
echo First round of maker.
nice -19 mpiexec -n "$NUMPROC" maker -RM_off -q --ignore_nfs_tmp 1>/dev/null
# group up all generated.gff3
gff3_merge -d "$DATASTORE" -o "$MAKERGFF"
echo "first round of maker done"

####################################################################
# gene training round 1
####################################################################

snap_train 1
sed -i "/^snaphmm/s/=/=$name.snap.round_1.hmm/1" $CTLFILE
echo "first snap round done"
augustus_train 1
echo "first aug done"
genemark_train
echo "first gm done"

# Done with hard evidence
if [ "$ESTGFF" -o "$ESTFASTA" ] # if EST data was used 
then # no more RNA mapping is needed 
	sed -i "/^est2genome/s/1/0/1" $CTLFILE
fi
sed -i "/^protein2genome/s/1/0/1;
		/^augustus_species/s/=/=$name.augustus.round_1/1;
		/^gmhmm/s/=/=$name.GM.mod/1" $CTLFILE
###################################################################
# Maker round 2
###################################################################

cp $CTLFILE $CTLFILE.round_2
echo Second round of maker.
nice -19 mpiexec -n "$NUMPROC" maker -RM_off -q --ignore_nfs_tmp

gff3_merge -d "$DATASTORE" -o "$MAKERGFF"
echo "second maker done"
####################################################################
# gene training round 2
####################################################################

snap_train 2
sed -i "/^snaphmm/s/=$name.snap.round_1.hmm/=$name.snap.round_2.hmm/1" $CTLFILE
echo "second snap done"
augustus_train 2
echo "second aug done"
sed -i "/^augustus_species/s/_1/_2/1" $CTLFILE

###################################################################
# Maker round 3
###################################################################

sed -i "/^alt_splice/ s/=0/=1/" $CTLFILE
cp $CTLFILE $CTLFILE.round_3
echo Third round of maker.
nice -19 mpiexec -n "$NUMPROC" maker -RM_off -q --ignore_nfs_tmp

# Final merging of gff3 and fasta files
gff3_merge -d "$DATASTORE" -o "$MAKERGFF"
fasta_merge -d "$DATASTORE"
echo "third maker done"

exit
