Hi, I have python script that able to download genome sequencing data from Ensembl plant database.
So, just I am trying to download Mouse genome data, but this script for only plant, how I change so I can download genome data from mammals also.
following is the script
#########################################################################################
#!/usr/bin/env python # some examples of files and locations # pub # lrwxrwxrwx 1 ftpuser ftpusers 30 Dec 7 16:33 current_homo_sapiens -> release-36/homo_sapiens_36_35i # #/pub/release-36/homo_sapiens_36_35i/data/fasta/dna #-rw-rw-r-- 1 ftpuser ftpusers 67675771 Nov 15 14:48 Homo_sapiens.NCBI35.dec.dna.chromosome.1.fa.gz #-rw-rw-r-- 1 ftpuser ftpusers 40802343 Nov 15 14:55 Homo_sapiens.NCBI35.dec.dna_rm.chromosome.1.fa.gz # #/pub/release-36/homo_sapiens_36_35i/data/fasta/pep #-rw-rw-r-- 1 ftpuser ftpusers 3817861 Nov 15 19:46 Homo_sapiens.NCBI35.dec.pep.known.fa.gz # #/pub/release-36/homo_sapiens_36_35i/data/mysql/homo_sapiens_core_36_35i #-rw-rw-r-- 1 ftpuser ftpusers 2957452 Dec 2 22:45 exon.txt.table.gz #-rw-rw-r-- 1 ftpuser ftpusers 1747738 Dec 2 22:45 exon_stable_id.txt.table.gz #-rw-rw-r-- 1 ftpuser ftpusers 1489045 Dec 2 22:45 exon_transcript.txt.table.gz #-rw-rw-r-- 1 ftpuser ftpusers 4626 Dec 2 21:57 homo_sapiens_core_36_35i.mysql40_compatible.sql.gz #-rw-rw-r-- 1 ftpuser ftpusers 4753 Dec 2 21:57 homo_sapiens_core_36_35i.sql.gz import os, os.path, re, sys from ftplib import FTP class collect: def __init__(self): self.data = [] def more(self, l): self.data.append(l) def maybeRetrFile(fromPath, toPath): what = 'from %s --> to %s' %(fromPath, toPath) if os.path.exists(toPath): print 'skipping '+what return else: if toPath.endswith('.gz') and os.path.exists(toPath[:-3]): print 'skipping (uncompressed) '+what return print what toFile = open(toPath, 'w') ec.retrbinary('RETR '+fromPath, toFile.write, blocksize=100000) toFile.close() target = sys.argv[1].strip().lower().replace(' ', '_') release = 'current/' if len(sys.argv) > 2: release = 'release-' + sys.argv[2] + '/' # set up initial connection host='ftp.ensemblgenomes.org' print 'Logging into '+host ec = FTP(host) ec.login() # look for target in a listing of pub files = collect() where='pub/'+release+'plants/'+'mysql' print 'Listing '+where ec.dir(where, files.more) tEntries = [l for l in files.data if target+"_core_" in l and '->' not in l ] if len(tEntries) != 1: print target + ' is either missing or not unique:' print tEntries print '\n'.join(files.data) ec.close() sys.exit(-1) # "parse" current link name curPat = re.compile(r''+target+'_core_(.+)_(.+)\Z') tPath = tEntries[0].split()[-1] mo = curPat.match(tPath) if not mo: print 'dont\'t understand release naming scheme: '+ tPath ec.close() sys.exit(-1) [maj, min] = mo.groups() majMin=maj+'_'+min outDir = target + '_' + majMin print 'Release: '+release[0:len(release)-1]+', '+'tPath: '+tPath+', '+'target: '+target+', '+'maj: '+maj+', '+'majMin: '+majMin+', '+'outDir: '+outDir ## if os.path.exists(outDir): ## print 'up to date: ' + tPath ## ec.close() ## sys.exit(0) # need to get files. first, set up directories. [dDir, mDir, pDir] = [outDir+d for d in ['/dna/', '/mysql/', '/pep/']] if not os.path.exists(dDir): os.makedirs(dDir, 0744) if not os.path.exists(mDir): os.makedirs(mDir, 0744) if not os.path.exists(pDir): os.makedirs(pDir, 0744) # retrieve dna dnaPat = re.compile(r'\.dna(_rm)?\.chromosome\..+\.fa\.gz\Z') dFiles = collect() where = 'pub/'+release+'plants/fasta/%s/dna' % target print 'Changing dir to '+where ec.dir(where, dFiles.more) dKeep = [l for l in dFiles.data if dnaPat.search(l)] for f in dKeep: fn = f.split()[-1] maybeRetrFile(where+'/'+fn, dDir+fn) # retrieve pep where = 'pub/'+release+'plants/fasta/%s/pep' % target pFiles = collect() print 'Changing dir to '+where ec.dir(where, pFiles.more) for f in pFiles.data: fn = f.split()[-1] maybeRetrFile(where+'/'+fn, pDir+fn) # retrieve mysql # older releases?: mFiles = ['exon.txt.table', 'exon_transcript.txt.table', 'gene_stable_id.txt.table', 'seq_region.txt.table', 'transcript.txt.table', 'translation.txt.table', 'translation_stable_id.txt.table', target+'_core_'+majMin+'.sql', target+'_core_'+majMin+'.mysql40_compatible.sql'] #older releases which have *_stable_id.txt: mFiles = ['exon.txt', 'exon_transcript.txt', 'gene_stable_id.txt', 'seq_region.txt', 'transcript.txt', 'translation.txt', 'translation_stable_id.txt', target+'_core_'+majMin+'.sql'] mFiles = ['exon.txt', 'exon_transcript.txt', 'seq_region.txt', 'transcript.txt', 'translation.txt', target+'_core_'+majMin+'.sql'] where = 'pub/'+release+'plants/mysql/%s_core_%s' % (target, majMin) print 'Changing dir to '+where for mf in mFiles: maybeRetrFile(where+'/'+mf+'.gz', mDir+mf+'.gz') # retrieve GTF where = 'pub/'+release+'gtf/%s' % (target) print 'Changing dir to '+where gtfPat = re.compile(r'\.gtf\.gz\Z') gFiles = collect() ec.dir(where, gFiles.more) gKeep = [l for l in gFiles.data if gtfPat.search(l)] for f in gKeep: fn = f.split()[-1] maybeRetrFile(where+'/'+fn, mDir+fn) ec.close() print 'Processing Fetched Files' #os.system('%s/processEnsemblFiles.sh %s' % (sys.path[0], outDir))
##############################################################################################################
First describe what you exactly need. Genomic fasta files? Annotations as gtf? Proteins as fasta? Because dumping everything is pointless if you are not going to use it.
Second, from a first look at the script, it does not select any subset of species from plant DB. So you have to find a way to select genomes of interests from a list of species. The brain dead but doable way is to get text file containing just interesting for you latin names starting with all species list obtainable i.e. here: ftp://ftp.ensembl.org/pub/release-81/fasta/ (each directory name here).
Once you got it:
It will be trivial to transform such list into wget commands:
Thanks for reply. I run this script like
and it is working perfectly. But about mammals I changed some path of ensembl database, but it is showing error...
This script automatically downloading dna, rm_dna, .pep and exon MySQL files that I need..but it is working only for plant at ensembl database
There can be many errors this script can report when trying to fetch files from different ftp site with different structure. BTW, do you set up a local copy of mammalian ENSEMBL DB or something like this? Because I do not see the reasons that at the same time you need all individual chromosome fasta files, whole genome masked and unmasked, plus bunch of mysql tables. Edit the script post as an answer. Please note that it is not a proper answer, just the script you posted with few lines changed. I strongly suggest you debug it using saccharomyces_cerevisiae as target to download and not any mammalian sized entry.