#!/usr/bin/env bash
#
# fetchAllGenomesByTaxon.sh - for each taxon given on the command line, look
# up its genome record at NCBI, then download and unpack the GenBank genomic
# FASTA and (when one exists) the protein FASTA into the current directory.
#
# usage: fetchAllGenomesByTaxon.sh Daphnia_pulex Lepeophtheirus_salmonis
#   Each argument is one taxon: either use quotes or underscores.
#
# Requires NCBI Entrez Direct (esearch, efetch, xtract), wget and gunzip.
# Side effects per taxon: writes "<taxon>.genome.esearch.docsum" and
# "<taxon>.assembly.esearch.docsum"; wget appends its log to wget.log.
#
# Exit status: 0 if every genome was fetched, 1 if any taxon failed,
# 2 when called without arguments, 127 when a required tool is missing.
#
# To define the taxon list inline instead of reading it from the command
# line, replace the final `main "$@"` with e.g.
#   main "Daphnia pulex" "Drosophila melanogaster" "Anopheles gambiae" \
#        "Pediculus humanus" "Ixodes scapularis" "Apis mellifera" "Bombyx mori"

set -u

# Kept as an array so every option reaches wget as its own word.
WGET_OPTS=(-c --random-wait -t 40 -a wget.log)

# warn MESSAGE...
# Print a diagnostic to stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# download_and_unpack URL
# Download URL into the current directory and gunzip the downloaded file.
# Returns non-zero if the download or the decompression fails.
download_and_unpack() {
  local url=$1
  local file=${url##*/}

  wget "${WGET_OPTS[@]}" "$url" || return
  gunzip -f -- "$file"
}

# fetch_taxon TAXON
# Resolve TAXON to an assembly accession, then download its genome and
# protein sets. Returns 0 when the genome was downloaded, 1 otherwise.
# A missing protein set is only reported, never treated as a failure.
fetch_taxon() {
  local tax=$1
  local genome acc name result ftp_path stem genome_url protein_url
  local rc=0

  printf 'getting genome for: %s\n' "$tax"

  #mkdir -p "$tax" # if you want to create a directory
  #cd "$tax"

  # The whole query is quoted so "[Organism:exp]" can never be glob-expanded.
  genome=$(esearch -db genome -query "${tax}[Organism:exp]" \
    | efetch -format docsum \
    | tee "${tax}.genome.esearch.docsum")

  acc=$(printf '%s\n' "$genome" \
    | xtract -pattern DocumentSummary -element Assembly_Accession)
  name=$(printf '%s\n' "$genome" \
    | xtract -pattern DocumentSummary -element Assembly_Name)

  if [[ -z "$acc" ]]; then
    warn "no assembly accession found for: $tax"
    return 1
  fi
  printf 'authoritative genome: %s %s\n' "$acc" "$name"

  result=$(esearch -db assembly -query "$acc" \
    | efetch -format docsum \
    | tee "${tax}.assembly.esearch.docsum")

  ftp_path=$(printf '%s\n' "$result" \
    | xtract -pattern DocumentSummary -element FtpPath_GenBank)

  if [[ -z "$ftp_path" ]]; then
    warn "no GenBank FTP path found for: $tax ($acc)"
    return 1
  fi
  printf 'FtpPath: %s\n' "$ftp_path"

  # Files inside the assembly directory are prefixed with the directory name.
  stem=${ftp_path##*/}
  genome_url="${ftp_path}/${stem}_genomic.fna.gz"
  protein_url="${ftp_path}/${stem}_protein.faa.gz"

  ## get genome data
  printf 'Downloading %s ...\n' "$genome_url"
  if ! download_and_unpack "$genome_url"; then
    warn "failed to fetch genome for: $tax"
    rc=1
  fi

  ## get protein data
  # This download may fail; in that case the taxon is not marked as failed.
  printf 'Downloading %s ...\n' "$protein_url"
  if ! download_and_unpack "$protein_url"; then
    warn "no protein set fetched for: $tax"
  fi

  # cd ..
  return "$rc"
}

# main TAXON...
# Process every taxon in turn; one failing taxon does not stop the others.
# Uses `return` (not `exit`) so the status simply becomes the script's status.
main() {
  local tax tool
  local status=0

  if (( $# == 0 )); then
    warn "usage: ${0##*/} TAXON [TAXON...]"
    return 2
  fi

  for tool in esearch efetch xtract wget gunzip; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      warn "required tool not found: $tool"
      return 127
    fi
  done

  for tax in "$@"; do
    fetch_taxon "$tax" || status=1
  done
  return "$status"
}

main "$@"
You want to do orthologue identification with OMA and therefore the first task you describe above needs some correction to be successful:
I have a simple shell script that can download the proteome of the representative genome automatically if it exists. For genomes where the gene prediction pipeline has not been run, however, it cannot give you anything.
This might be helpful if you haven't come across it yet:
Expanding the Orthologous Matrix (OMA) programmatic interfaces: REST API and the OmaDB packages for R and Python
I want to fetch genomes from NCBI, not from OMA.
First, I want to supply NCBI species IDs to download the genomes.