# Build braker.bed12 (the gff2bed rendering of the BRAKER gene models).
#   $1  handed to gffread as its -g argument (genome FASTA, going by gffread usage)
# Side product: braker-noUTRs.gff3, a copy of the GFF3 stream taken right
# before the BED conversion.
gtf2gff.pl --out=/dev/stdout --printExon --gff3 <../annotation/braker/braker/braker.gtf |
  gff3_file_UTR_trimmer.pl /dev/stdin |
  # -T makes gffread emit GTF; its stderr is discarded
  gffread -T -o /dev/stdout -g "$1" /dev/stdin 2>/dev/null |
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      # Column 9 is cut down to its first four whitespace-separated tokens
      # (two key/value pairs, presumably transcript_id and gene_id), and a
      # trailing ".<digits>" is removed from the quoted value in token four.
      split($9, tok, " ")
      gsub(/\.[0-9]+";/, "\";", tok[4])
      $9 = tok[1] " " tok[2] " " tok[3] " " tok[4]
      print
    }
  ' |
  gt gtf_to_gff3 |
  # sort and tidy, write "braker" into the source column, add intron features
  gt gff3 -sort -tidy -setsource braker -addintrons |
  awk '
    BEGIN { FS = OFS = "\t" }

    # gene: only the first attribute survives
    $3 == "gene" {
      split($9, attr, ";")
      $9 = attr[1] ";"
      print
    }

    # mRNA: the first two attributes survive; the value of the first one is
    # remembered and used as Parent for the child features that follow
    $3 == "mRNA" {
      split($9, kv, "[;=]")
      split($9, attr, ";")
      tx_id = kv[2]
      $9 = attr[1] ";" attr[2] ";"
      print
    }

    # children get fresh running IDs, numbered separately per feature type
    $3 == "intron" {
      n_intron++
      $9 = "ID=intron" n_intron ";Parent=" tx_id ";"
      print
    }
    $3 == "exon" {
      n_exon++
      $9 = "ID=exon" n_exon ";Parent=" tx_id ";"
      print
    }
    $3 == "CDS" {
      n_cds++
      $9 = "ID=CDS" n_cds ";Parent=" tx_id ";"
      print
    }
  ' |
  tee braker-noUTRs.gff3 |
  gff2bed |
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      # Prefix the name column and the ID/Parent values with the source
      # (column 7) followed by a dot.
      $4 = $7 "." $4
      split($10, kv, "[=;]")
      if ($10 ~ /Parent/) {
        $10 = kv[1] "=" $7 "." kv[2] ";" kv[3] "=" $7 "." kv[4] ";"
      } else {
        $10 = kv[1] "=" $7 "." kv[2] ";"
      }
      print
    }
  ' >braker.bed12

# Write one protein FASTA (<set>.faa) per annotation set, as input for BUSCO.
#   $1  passed to the gt tools as -seqfile
for i in ref maker braker eukan gemoma; do
	# Reorder the BED columns back into GFF3 order; $2+1 converts the BED
	# start coordinate into the GFF3 start coordinate.
	awk '
		BEGIN { FS = OFS = "\t"; print "##gff-version 3" }
		{ print $1, $7, $8, $2 + 1, $3, $5, $6, $9, $10 }
	' "${i}.bed12" |
		gt gff3 -sort -retainids -tidy |
		gt select -retainids -hascds |
		gt id_to_md5 -seqfile "$1" -matchdescstart |
		gt gff3 -sort -retainids |
		gt extractfeat -type CDS -join -seqfile "$1" -retainids -translate |
		# put "<set>." right after the ">" of each FASTA header ...
		sed "/^>/ s/>/>${i}./" |
		# ... and cut every line at its first space
		cut -f1 -d' ' >"${i}.faa"
done

# Stop here with a success status. When this file is run as a script, nothing
# below this line is executed.
# NOTE(review): presumably the remaining commands are kept as notes to be run
# by hand - confirm.
exit 0

# Run BUSCO in protein mode on each FASTA, one job at a time. {} is the input
# file and {/.} its basename without extension (maker.faa -> maker.busco-out).
parallel --jobs 1 \
  "busco -i {} --cpu 40 --lineage ./eurotiales_odb10 -m prot -o {/.}.busco-out -f" \
  ::: maker.faa braker.faa eukan.faa gemoma.faa ref.faa

# Merge every full_table.tsv found below the current directory into
# busco-results.tsv: comment lines are dropped, whitespace runs become tabs,
# columns 1, 2, 3 and 5 are kept under a new header, and lines containing
# "Missing" are removed.
find . -name full_table.tsv -exec grep -v '^#' {} \; |
  sed 's/\s\+/\t/g' |
  awk '
    BEGIN {
      FS = OFS = "\t"
      print "busco.id", "busco.status", "s.name", "busco.hmm.len"
    }
    { print $1, $2, $3, $5 }
  ' |
  grep -v Missing >busco-results.tsv

# Build ref.bed12 from the reference annotation, the same way braker.bed12 is
# built from the BRAKER models.
# Side product: ref-noUTRs.gff3, a copy of the GFF3 stream taken right before
# the BED conversion.
#
# Fix: the pipeline used to end without a redirect, so the result went to
# stdout and ref.bed12 - which the .faa loop reads - was never written.

# Stage 1: for every mRNA line take the values of its first two attributes
# (presumably ID and Parent), append ";" to each and use them as fixed-string
# patterns to pull the matching lines out of the same GFF.
awk '$3=="mRNA"' Plasmodium_falciparum_3D7.gff |
  cut -f9 |
  cut -f1,2 -d';' |
  sed 's/;/\n/g' |
  cut -f2 -d'=' |
  sed 's/$/;/' |
  grep -Ff - Plasmodium_falciparum_3D7.gff |
  gff3_file_UTR_trimmer.pl /dev/stdin |
  # drop blank lines and put the GFF3 version pragma in front of line 1
  sed '/^\s*$/d; 1 i\##gff-version 3' |
  # sort and tidy, write "ref" into the source column, add intron features
  gt gff3 -sort -tidy -setsource ref -addintrons |
  awk '
    BEGIN { FS = OFS = "\t" }

    # gene: only the first attribute survives
    $3 == "gene" {
      split($9, a, ";")
      $9 = a[1] ";"
      print
    }

    # mRNA: the first two attributes survive; the value of the first one is
    # remembered and used as Parent for the child features that follow
    $3 == "mRNA" {
      split($9, a, "[;=]")
      split($9, b, ";")
      mRNA = a[2]
      $9 = b[1] ";" b[2] ";"
      print
    }

    # children get fresh running IDs, numbered separately per feature type
    $3 == "intron" {
      i++
      $9 = "ID=intron" i ";Parent=" mRNA ";"
      print
    }
    $3 == "exon" {
      j++
      $9 = "ID=exon" j ";Parent=" mRNA ";"
      print
    }
    $3 == "CDS" {
      k++
      $9 = "ID=CDS" k ";Parent=" mRNA ";"
      print
    }
  ' |
  tee ref-noUTRs.gff3 |
  gff2bed |
  awk '
    BEGIN { FS = OFS = "\t" }
    {
      # Prefix the name column and the ID/Parent values with the source
      # (column 7) followed by a dot.
      $4 = $7 "." $4
      split($10, a, "[=;]")
      if ($10 ~ /Parent/) {
        a[2] = $7 "." a[2]
        a[4] = $7 "." a[4]
        $10 = a[1] "=" a[2] ";" a[3] "=" a[4] ";"
      } else {
        a[2] = $7 "." a[2] ";"
        $10 = a[1] "=" a[2]
      }
      print
    }
  ' >ref.bed12
