# generate a properly formatted bed12:
# ih
# eukan annotation -> eukan-noUTRs.gff3 (cleaned GFF3) and eukan.bed12.
#   $1  genome sequence file, passed to gffread via -g
# Flow: UTR trimmer (presumably strips UTRs, going by its name) -> gffread -T
# (GTF) -> gt gtf_to_gff3 -> gt gff3 (sorted, tidied, source set to "eukan",
# introns added) -> attribute clean-up -> gff2bed -> ID prefixing.
gff3_file_UTR_trimmer.pl ../annotation/eukan/final.gff3 |
  gffread -T -o /dev/stdout -g "$1" /dev/stdin 2>/dev/null |
  # Keep only the first four space-separated tokens of the attribute column.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      split($9, tok, " ")
      $9 = tok[1] " " tok[2] " " tok[3] " " tok[4]
      print
    }' |
  gt gtf_to_gff3 |
  gt gff3 -sort -tidy -setsource eukan -addintrons |
  # Reduce attributes to ID (plus Parent), renumber intron/exon/CDS IDs with
  # running counters and attach them to the most recently seen mRNA.
  # Feature types other than gene/mRNA/intron/exon/CDS are dropped.
  awk '
    BEGIN { FS = OFS = "\t" }
    $3 == "gene" {
      split($9, attr, ";")
      $9 = attr[1] ";"
      print
    }
    $3 == "mRNA" {
      split($9, kv, "[;=]")
      split($9, attr, ";")
      tx = kv[2]
      $9 = attr[1] ";" attr[2] ";"
      print
    }
    $3 == "intron" {
      n_intron++
      $9 = "ID=intron" n_intron ";Parent=" tx ";"
      print
    }
    $3 == "exon" {
      n_exon++
      $9 = "ID=exon" n_exon ";Parent=" tx ";"
      print
    }
    $3 == "CDS" {
      n_cds++
      $9 = "ID=CDS" n_cds ";Parent=" tx ";"
      print
    }' |
  tee eukan-noUTRs.gff3 |
  gff2bed |
  # Prefix the name column and the ID/Parent values with column 7
  # (the source column in gff2bed output) so IDs are unique across annotators.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      src = $7
      $4 = src "." $4
      split($10, f, "[=;]")
      if ($10 ~ /Parent/) {
        $10 = f[1] "=" src "." f[2] ";" f[3] "=" src "." f[4] ";"
      } else {
        $10 = f[1] "=" src "." f[2] ";"
      }
      print
    }' > eukan.bed12
# braker
# BRAKER annotation -> braker-noUTRs.gff3 (cleaned GFF3) and braker.bed12.
#   $1  genome sequence file, passed to gffread via -g
# The BRAKER GTF is first converted to GFF3 with exon features printed, then
# follows the same route as the other annotators: UTR trimmer -> gffread -T ->
# gt gtf_to_gff3 -> gt gff3 -> attribute clean-up -> gff2bed -> ID prefixing.
gtf2gff.pl --out=/dev/stdout --printExon --gff3 <../annotation/braker/braker/braker.gtf |
  gff3_file_UTR_trimmer.pl /dev/stdin |
  gffread -T -o /dev/stdout -g "$1" /dev/stdin 2>/dev/null |
  # Keep the first four attribute tokens and strip a trailing ".<digits>"
  # from the quoted value in the fourth token.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      split($9, tok, " ")
      gsub(/\.[0-9]+";/, "\";", tok[4])
      $9 = tok[1] " " tok[2] " " tok[3] " " tok[4]
      print
    }' |
  gt gtf_to_gff3 |
  gt gff3 -sort -tidy -setsource braker -addintrons |
  # Reduce attributes to ID (plus Parent), renumber intron/exon/CDS IDs with
  # running counters and attach them to the most recently seen mRNA.
  # Feature types other than gene/mRNA/intron/exon/CDS are dropped.
  awk '
    BEGIN { FS = OFS = "\t" }
    $3 == "gene" {
      split($9, attr, ";")
      $9 = attr[1] ";"
      print
    }
    $3 == "mRNA" {
      split($9, kv, "[;=]")
      split($9, attr, ";")
      tx = kv[2]
      $9 = attr[1] ";" attr[2] ";"
      print
    }
    $3 == "intron" {
      n_intron++
      $9 = "ID=intron" n_intron ";Parent=" tx ";"
      print
    }
    $3 == "exon" {
      n_exon++
      $9 = "ID=exon" n_exon ";Parent=" tx ";"
      print
    }
    $3 == "CDS" {
      n_cds++
      $9 = "ID=CDS" n_cds ";Parent=" tx ";"
      print
    }' |
  tee braker-noUTRs.gff3 |
  gff2bed |
  # Prefix the name column and the ID/Parent values with column 7
  # (the source column in gff2bed output) so IDs are unique across annotators.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      src = $7
      $4 = src "." $4
      split($10, f, "[=;]")
      if ($10 ~ /Parent/) {
        $10 = f[1] "=" src "." f[2] ";" f[3] "=" src "." f[4] ";"
      } else {
        $10 = f[1] "=" src "." f[2] ";"
      }
      print
    }' > braker.bed12
# maker
# MAKER annotation -> maker-noUTRs.gff3 (cleaned GFF3) and maker.bed12.
#   $1  genome sequence file, passed to gffread via -g
# Only records whose second column is "maker" are kept from the *all.gff3
# file(s) found directly under ../annotation/maker.
#
# find -exec is used instead of an unquoted $(find ...) argument list: with no
# match grep is simply not run (rather than reading stdin), paths containing
# whitespace are passed intact, and -h stops grep from prefixing every record
# with "filename:" when more than one file matches.
find ../annotation/maker -maxdepth 1 -name '*all.gff3' -exec grep -hv '^#' {} + |
  awk 'BEGIN{print "##gff-version 3"} $2=="maker"' |
  gff3_file_UTR_trimmer.pl /dev/stdin |
  gffread -T -o /dev/stdout -g "$1" /dev/stdin 2>/dev/null |
  # Keep attribute tokens 1-3 and token 6 (not token 4 as for the other
  # annotators).
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      split($9, tok, " ")
      $9 = tok[1] " " tok[2] " " tok[3] " " tok[6]
      print
    }' |
  gt gtf_to_gff3 |
  gt gff3 -sort -tidy -setsource maker -addintrons |
  # Reduce attributes to ID (plus Parent), renumber intron/exon/CDS IDs with
  # running counters and attach them to the most recently seen mRNA.
  # Feature types other than gene/mRNA/intron/exon/CDS are dropped.
  awk '
    BEGIN { FS = OFS = "\t" }
    $3 == "gene" {
      split($9, attr, ";")
      $9 = attr[1] ";"
      print
    }
    $3 == "mRNA" {
      split($9, kv, "[;=]")
      split($9, attr, ";")
      tx = kv[2]
      $9 = attr[1] ";" attr[2] ";"
      print
    }
    $3 == "intron" {
      n_intron++
      $9 = "ID=intron" n_intron ";Parent=" tx ";"
      print
    }
    $3 == "exon" {
      n_exon++
      $9 = "ID=exon" n_exon ";Parent=" tx ";"
      print
    }
    $3 == "CDS" {
      n_cds++
      $9 = "ID=CDS" n_cds ";Parent=" tx ";"
      print
    }' |
  tee maker-noUTRs.gff3 |
  gff2bed |
  # Prefix the name column and the ID/Parent values with column 7
  # (the source column in gff2bed output) so IDs are unique across annotators.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      src = $7
      $4 = src "." $4
      split($10, f, "[=;]")
      if ($10 ~ /Parent/) {
        $10 = f[1] "=" src "." f[2] ";" f[3] "=" src "." f[4] ";"
      } else {
        $10 = f[1] "=" src "." f[2] ";"
      }
      print
    }' > maker.bed12
# gemoma
# GeMoMa annotation -> gemoma-noUTRs.gff3 (cleaned GFF3) and gemoma.bed12.
#   $1  genome sequence file, passed to gffread via -g
# Every CDS record is emitted twice, the second time retyped as "exon", before
# the file goes through the shared route: UTR trimmer -> gffread -T ->
# gt gtf_to_gff3 -> gt gff3 -> attribute clean-up -> gff2bed -> ID prefixing.
awk '
  BEGIN { FS = OFS = "\t" }
  { print }
  $3 == "CDS" { $3 = "exon"; print }
' ../annotation/gemoma/gemoma_out/final_annotation.gff |
  gff3_file_UTR_trimmer.pl /dev/stdin |
  gffread -T -o /dev/stdout -g "$1" /dev/stdin 2>/dev/null |
  # Keep the first four attribute tokens and strip a trailing ".<digits>"
  # from the end of the quoted value in the fourth token.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      split($9, tok, " ")
      gsub(/\.[0-9]+\";$/, "\";", tok[4])
      $9 = tok[1] " " tok[2] " " tok[3] " " tok[4]
      print
    }' |
  gt gtf_to_gff3 |
  gt gff3 -sort -tidy -setsource gemoma -addintrons |
  # Reduce attributes to ID (plus Parent), renumber intron/exon/CDS IDs with
  # running counters and attach them to the most recently seen mRNA.
  # Feature types other than gene/mRNA/intron/exon/CDS are dropped.
  awk '
    BEGIN { FS = OFS = "\t" }
    $3 == "gene" {
      split($9, attr, ";")
      $9 = attr[1] ";"
      print
    }
    $3 == "mRNA" {
      split($9, kv, "[;=]")
      split($9, attr, ";")
      tx = kv[2]
      $9 = attr[1] ";" attr[2] ";"
      print
    }
    $3 == "intron" {
      n_intron++
      $9 = "ID=intron" n_intron ";Parent=" tx ";"
      print
    }
    $3 == "exon" {
      n_exon++
      $9 = "ID=exon" n_exon ";Parent=" tx ";"
      print
    }
    $3 == "CDS" {
      n_cds++
      $9 = "ID=CDS" n_cds ";Parent=" tx ";"
      print
    }' |
  tee gemoma-noUTRs.gff3 |
  gff2bed |
  # Prefix the name column and the ID/Parent values with column 7
  # (the source column in gff2bed output) so IDs are unique across annotators.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      src = $7
      $4 = src "." $4
      split($10, f, "[=;]")
      if ($10 ~ /Parent/) {
        $10 = f[1] "=" src "." f[2] ";" f[3] "=" src "." f[4] ";"
      } else {
        $10 = f[1] "=" src "." f[2] ";"
      }
      print
    }' > gemoma.bed12
# funannotate
# funannotate annotation -> funannotate-noUTRs.gff3 and funannotate.bed12.
#   $1  genome sequence file, passed to gffread via -g
# Step 1 builds a list of the ID and Parent values found on mRNA records and
# uses it as a fixed-string filter (grep -F) on the same GFF3, so only records
# mentioning one of those identifiers are kept.
# NOTE(review): the match is a plain substring match, so an identifier that is
# a prefix of another one also selects the longer one - confirm acceptable.
#
# grep -h suppresses the "filename:" prefix grep would otherwise add to every
# record if the funannotate_*.gff3 glob expands to more than one file.
awk '$3=="mRNA"' ../annotation/funannotate/funannotate_train/update_results/funannotate_*.gff3 |
  cut -f9 |
  sed 's/;/\n/g' |
  sed -n 's/ID=//p; s/Parent=//p' |
  grep -hFf - ../annotation/funannotate/funannotate_train/update_results/funannotate_*.gff3 |
  sed '1 i\##gff-version 3' |
  gff3_file_UTR_trimmer.pl /dev/stdin |
  gffread -T -o /dev/stdout -g "$1" /dev/stdin 2>/dev/null |
  # Keep the first four attribute tokens and strip a trailing ".<digits>"
  # from the end of the quoted value in the fourth token.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      split($9, tok, " ")
      gsub(/\.[0-9]+\";$/, "\";", tok[4])
      $9 = tok[1] " " tok[2] " " tok[3] " " tok[4]
      print
    }' |
  gt gtf_to_gff3 |
  gt gff3 -sort -tidy -setsource funannotate -addintrons |
  # Reduce attributes to ID (plus Parent), renumber intron/exon/CDS IDs with
  # running counters and attach them to the most recently seen mRNA.
  # Feature types other than gene/mRNA/intron/exon/CDS are dropped.
  awk '
    BEGIN { FS = OFS = "\t" }
    $3 == "gene" {
      split($9, attr, ";")
      $9 = attr[1] ";"
      print
    }
    $3 == "mRNA" {
      split($9, kv, "[;=]")
      split($9, attr, ";")
      tx = kv[2]
      $9 = attr[1] ";" attr[2] ";"
      print
    }
    $3 == "intron" {
      n_intron++
      $9 = "ID=intron" n_intron ";Parent=" tx ";"
      print
    }
    $3 == "exon" {
      n_exon++
      $9 = "ID=exon" n_exon ";Parent=" tx ";"
      print
    }
    $3 == "CDS" {
      n_cds++
      $9 = "ID=CDS" n_cds ";Parent=" tx ";"
      print
    }' |
  tee funannotate-noUTRs.gff3 |
  gff2bed |
  # Prefix the name column and the ID/Parent values with column 7
  # (the source column in gff2bed output) so IDs are unique across annotators.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      src = $7
      $4 = src "." $4
      split($10, f, "[=;]")
      if ($10 ~ /Parent/) {
        $10 = f[1] "=" src "." f[2] ";" f[3] "=" src "." f[4] ";"
      } else {
        $10 = f[1] "=" src "." f[2] ";"
      }
      print
    }' > funannotate.bed12
# snowyowl
#awk 'BEGIN{FS=OFS="\t"}{if ($3=="CDS") {print; gsub(/cds/,"exon", $0); $3="exon"}; if($2=="GeneMark.hmm" && $3=="exon") {$3="CDS"; print; gsub(/cds/, "exon", $0);}}1' snowyowl.gff3 | grep -v '^#' | awk 'BEGIN{FS=OFS="\t"; c=0; print "##gff-version 3"}{if($3=="mRNA" && $4 != start && $5 != end) {c++; split($9, a, ";"); mRNAfeat=a[1]";Parent=gene_"c";"; genefeat="ID=gene_"c";"; $3="gene"; $9=genefeat; print; $3="mRNA"; $9=mRNAfeat; print} else if ($3=="mRNA" && ($4 == start || $5 == end)) {split($9, a, ";"); $9=a[1]";Parent=gene_"c";"; print} else {print}} {if ($3=="mRNA") {start=$4; end=$5}}' | grep -v Alias= | gt gff3 -setsource gemoma -tidy -sort -addintrons | awk 'BEGIN{FS=OFS="\t"} $3=="gene"{split($9, a, ";");$9=a[1]";"; print} $3=="mRNA"{split($9, a, "[;=]"); split($9, b, ";"); mRNA=a[2]; $9=b[1]";"b[2]";"; print} $3=="intron"{i++; $9="ID=intron"i";Parent="mRNA";"; print} $3=="exon"{j++; $9="ID=exon"j";Parent="mRNA";"; print} $3=="CDS"{k++; split($9, a, ";"); a[1]="ID=CDS"k; $9=a[1]";Parent="mRNA";"; print}' | awk 'BEGIN{FS=OFS="\t"} $3=="gene"{split($9, a, ";"); $9=a[1]";"; print} $3=="mRNA"{split($9, a, "[;=]"); split($9, b, ";"); mRNA=a[2]; $9=b[1]";"b[2]";"; print} $3=="intron"{i++; $9="ID=intron"i";Parent="mRNA";"; print} $3=="exon"{j++; $9="ID=exon"j";Parent="mRNA";"; print} $3=="CDS"{k++; split($9, a, ";"); a[1]="ID=CDS"k; $9=a[1]";Parent="mRNA";"; print}' | gff2bed | awk 'BEGIN{FS=OFS="\t"}{$4=$7"."$4; split($10, a, "[=;]"); if ($10 ~ /Parent/) {a[2]=$7"."a[2]; a[4]=$7"."a[4]; $10=a[1]"="a[2]";"a[3]"="a[4]";"} else {a[2]=$7"."a[2]";"; $10=a[1]"="a[2]}}1' > snowyowl.bed12

# create faa files for each bed file to run busco
# For each annotation set, rebuild GFF3 from <name>.bed12 and write the
# translated, joined CDS of every CDS-bearing feature to <name>.faa.
#   $1  genome sequence file, passed to gt via -seqfile
# A set whose .bed12 is missing or empty is skipped with a warning instead of
# silently producing an empty .faa (ref.bed12 in particular is not written by
# the part of this script that runs before "exit 0").
for i in ref maker braker eukan gemoma funannotate; do
  if [ ! -s "${i}.bed12" ]; then
    printf 'warning: %s.bed12 is missing or empty; skipping %s.faa\n' "$i" "$i" >&2
    continue
  fi
  # Reorder the gff2bed columns back into GFF3 order; BED starts are 0-based,
  # hence $2+1 for the GFF3 start coordinate.
  awk 'BEGIN{FS=OFS="\t"; print "##gff-version 3"}{print $1, $7, $8, $2+1, $3, $5, $6, $9, $10}' "${i}.bed12" |
    gt gff3 -sort -retainids -tidy |
    gt select -retainids -hascds |
    gt id_to_md5 -seqfile "$1" -matchdescstart |
    gt gff3 -sort -retainids |
    gt extractfeat -type CDS -join -seqfile "$1" -retainids -translate |
    # Tag each FASTA header with the annotation-set name, then keep only the
    # first space-delimited word of every line.
    sed "/^>/ s/>/>${i}./" |
    cut -f1 -d' ' > "${i}.faa"
done

# Stop here: the script terminates successfully at this point, so none of the
# commands below this line are executed when the file is run as a whole.
# NOTE(review): presumably the remaining commands are kept as a record of
# steps that are run by hand - confirm.
exit 0

# Run BUSCO in protein mode on each proteome, one job at a time (--jobs 1);
# every job is given 40 CPUs. In the command template {} is the input file
# and {/.} is its basename without extension, so maker.faa is written to
# maker.busco-out, and so on. -f lets BUSCO overwrite an existing run.
parallel --jobs 1 \
  "busco -i {} --cpu 40 --lineage ./eurotiales_odb10 -m prot -o {/.}.busco-out -f" \
  ::: maker.faa braker.faa eukan.faa gemoma.faa funannotate.faa ref.faa
# summarise busco data
# Merge every full_table.tsv below the current directory into
# busco-results.tsv: drop comment lines, collapse whitespace runs to single
# tabs, keep columns 1, 2, 3 and 5 under a fixed header, and discard any row
# containing "Missing".
find . -name full_table.tsv -exec grep -v '^#' {} \; |
  sed 's/\s\+/\t/g' |
  awk '
    BEGIN {
      FS = OFS = "\t"
      print "busco.id", "busco.status", "s.name", "busco.hmm.len"
    }
    { print $1, $2, $3, $5 }' |
  grep -v Missing > busco-results.tsv

# formatting references
# Reference annotation -> ref-noUTRs.gff3 (cleaned GFF3) and ref.bed12.
# Step 1 takes the values of the first two attributes of every mRNA record
# (each with ";" appended to limit partial matches) and uses them as a
# fixed-string filter on the same GFF, so only records mentioning one of
# those identifiers are kept. Unlike the predictor pipelines there is no
# gffread / gtf_to_gff3 round trip here.
#
# The final stage now writes ref.bed12, matching the <name>.bed12 convention
# of every other pipeline in this file; the .faa loop reads ref.bed12, and
# previously this output only went to stdout.
awk '$3=="mRNA"' Plasmodium_falciparum_3D7.gff |
  cut -f9 |
  cut -f1,2 -d';' |
  sed 's/;/\n/g' |
  cut -f2 -d'=' |
  sed 's/$/;/' |
  grep -Ff - Plasmodium_falciparum_3D7.gff |
  gff3_file_UTR_trimmer.pl /dev/stdin |
  # Drop blank lines and put the GFF3 version pragma in front of line 1.
  sed '/^\s*$/d; 1 i\##gff-version 3' |
  gt gff3 -sort -tidy -setsource ref -addintrons |
  # Reduce attributes to ID (plus Parent), renumber intron/exon/CDS IDs with
  # running counters and attach them to the most recently seen mRNA.
  # Feature types other than gene/mRNA/intron/exon/CDS are dropped.
  awk '
    BEGIN { FS = OFS = "\t" }
    $3 == "gene" {
      split($9, attr, ";")
      $9 = attr[1] ";"
      print
    }
    $3 == "mRNA" {
      split($9, kv, "[;=]")
      split($9, attr, ";")
      tx = kv[2]
      $9 = attr[1] ";" attr[2] ";"
      print
    }
    $3 == "intron" {
      n_intron++
      $9 = "ID=intron" n_intron ";Parent=" tx ";"
      print
    }
    $3 == "exon" {
      n_exon++
      $9 = "ID=exon" n_exon ";Parent=" tx ";"
      print
    }
    $3 == "CDS" {
      n_cds++
      $9 = "ID=CDS" n_cds ";Parent=" tx ";"
      print
    }' |
  tee ref-noUTRs.gff3 |
  gff2bed |
  # Prefix the name column and the ID/Parent values with column 7
  # (the source column in gff2bed output) so IDs are unique across annotators.
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      src = $7
      $4 = src "." $4
      split($10, f, "[=;]")
      if ($10 ~ /Parent/) {
        $10 = f[1] "=" src "." f[2] ";" f[3] "=" src "." f[4] ";"
      } else {
        $10 = f[1] "=" src "." f[2] ";"
      }
      print
    }' > ref.bed12
