This pipeline is very versatile: it can be used for anything from annotating prokaryotic genomes, to pangenome analyses, to identifying genes associated with a particular phenotype (AMR, diagnostic, etc.), and beyond.
#!/bin/bash
#Created 07/01/2024 by: Jordan Zehr and Gayatri Anil
## Invoke prokka according to the BioHPC documentation: prokka runs from a
## Singularity image, with the current directory bind-mounted (-B) and used
## as the working directory inside the container (--pwd).
## The command is kept in an array so that a $PWD containing spaces is passed
## intact; PROKKA_CMD is still exported with the same single-string value for
## anything that reads it from the environment.
prokka_cmd=(singularity run -C -B "$PWD" --pwd "$PWD" /programs/prokka-1.14.5-r9/prokka.sif)
export PROKKA_CMD="${prokka_cmd[*]}"

#######################################
# Print the assembly accession for a genome FASTA path.
# Arguments: $1 - path to a GCA*.fna file
# Outputs:   the first 15 characters of the file name with the directory
#            stripped, e.g. GCA_000000000.1 for
#            salDubgenomes-test/GCA_000000000.1_x.fna
#######################################
accession_from_path() {
  local fname=${1##*/}
  printf '%s\n' "${fname:0:15}"
}

## In a directory that contains assemblies, loop over all .fna files.
## The glob is expanded by the shell directly; a leading `ls` word would be
## iterated over as if it were a file name.
for file in salDubgenomes-test/GCA*.fna; do
  ## When nothing matches, bash leaves the pattern as a literal string:
  ## skip it instead of handing a non-existent file to prokka.
  [[ -e "$file" ]] || continue

  ## The GCA accession is used below to name the output directory and files.
  acc=$(accession_from_path "$file")

  ## print out which GCA you are running on ##
  echo "RUNNING PROKKA ON --> $acc"

  ## An existing output directory is treated as "already annotated".
  ## NOTE(review): a run that failed part-way also leaves this directory
  ## behind, so remove it by hand before re-running that genome.
  if [[ -d "prokka-dublin-results/${acc}" ]]; then
    ## if it does, print out that it's already done and include the GCA ##
    echo "ALREADY DONE with -- $acc"
  else
    # run prokka!
    # --outdir - set the output directory (named after the GCA)
    # --force  - allow prokka to write into an existing output directory
    # --prefix - file-name prefix for the output files (the GCA)
    # $file    - which GCA .fna prokka will run on
    if ! "${prokka_cmd[@]}" prokka --outdir "prokka-dublin-results/${acc}" --force --prefix "${acc}" "$file"; then
      # Report the failure but carry on with the remaining genomes.
      echo "PROKKA FAILED on -- $acc" >&2
    fi
  fi
done