|
#!/usr/bin/python |
|
|
|
# This script is designed to take a genbank file and 'slice out'/'subset' |
|
# regions (genes/operons etc.) and produce a separate file. This can be |
|
# done explicitly by telling the script which base sites to use, or can |
|
# 'decide' for itself by blasting a fasta of the sequence you're inter- |
|
# ed in against the Genbank you want to slice a record out of. |
|
|
|
# Note, the script (obviously) does not preseve the index number of the |
|
# bases from the original |
|
|
|
# Based upon the tutorial at: |
|
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc44 |
|
|
|
# This script depends on BLAST and having the makeblastdb functionality |
|
# available if BLAST_MODE is active. It also depends on Biopython. |
|
|
|
# Set up and handle arguments: |
|
|
|
import warnings |
|
import sys |
|
import subprocess |
|
import os |
|
import time |
|
from Bio import SeqIO |
|
|
|
__author__ = "Joe R. J. Healey" |
|
__version__ = "1.1.0" |
|
__title__ = "Genbank_slicer" |
|
__license__ = "GPLv3" |
|
__author_email__ = "J.R.J.Healey@warwick.ac.uk" |
|
|
|
# Import SearchIO and suppress experimental warning |
|
from Bio import BiopythonExperimentalWarning |
|
with warnings.catch_warnings(): |
|
warnings.simplefilter('ignore', BiopythonExperimentalWarning) |
|
from Bio import SearchIO |
|
|
|
|
|
def convert(basename, genbank): |
|
'''Convert the provided genbank to a fasta to BLAST.''' |
|
|
|
refFasta = "{}.fasta.tmp".format(basename) |
|
SeqIO.convert(genbank, 'genbank', refFasta, 'fasta') |
|
|
|
return refFasta |
|
|
|
def runBlast(basename, refFasta, fasta, verbose): |
|
'''Synthesise BLAST commands and make system calls''' |
|
|
|
resultHandle = "{}.blastout.tmp".format(basename) |
|
blastdb_cmd = 'makeblastdb -in {0} -dbtype nucl -title temp_blastdb'.format(refFasta) |
|
blastn_cmd = 'blastn -query {0} -strand both -task blastn -db {1} -perc_identity 100 -outfmt 6 -out {2} -max_target_seqs 1'.format(fasta, refFasta, resultHandle) |
|
|
|
print("Constructing BLAST Database: " + '\n' + blastdb_cmd) |
|
print("BLASTing: " + '\n' + blastn_cmd) |
|
DB_process = subprocess.Popen(blastdb_cmd, |
|
shell=True, |
|
stdin=subprocess.PIPE, |
|
stdout=subprocess.PIPE, |
|
stderr=subprocess.PIPE) |
|
DB_process.wait() |
|
BLAST_process = subprocess.Popen(blastn_cmd, |
|
shell=True, |
|
stdin=subprocess.PIPE, |
|
stdout=subprocess.PIPE, |
|
stderr=subprocess.PIPE) |
|
BLAST_process.wait() |
|
|
|
return resultHandle |
|
|
|
def getIndices(resultHandle): |
|
'''If not provided directly by the user, this function retrieves the best BLAST hit's indices.''' |
|
|
|
blast_result = SearchIO.read(resultHandle, 'blast-tab') |
|
|
|
print(blast_result[0][0]) |
|
start = blast_result[0][0].hit_start |
|
end = blast_result[0][0].hit_end |
|
|
|
return start, end |
|
|
|
def slice(start, end, genbank, FPoffset, TPoffset): |
|
'''Subset the provided genbank to return the sub record.''' |
|
|
|
seqObj = SeqIO.read(genbank, 'genbank') |
|
subRecord = seqObj[start-FPoffset:end+TPoffset] |
|
|
|
return subRecord |
|
|
|
|
|
def main(): |
|
################################################################################################### |
|
# Parse command line arguments |
|
import argparse |
|
try: |
|
parser = argparse.ArgumentParser( |
|
description='This script slices entries such as genes or operons out of a genbank, subsetting them as their own file.') |
|
parser.add_argument( |
|
'-g', |
|
'--genbank', |
|
action='store', |
|
required=True, |
|
help='The genbank file you wish to subset.') |
|
parser.add_argument( |
|
'-o', |
|
'--outfile', |
|
action='store', |
|
help='If specifed, the script will write a file, otherwise redirect STDOUT for pipelines.') |
|
parser.add_argument( |
|
'-s', |
|
'--start', |
|
type=int, |
|
help='Integer base index to slice from.') |
|
parser.add_argument( |
|
'-e', |
|
'--end', |
|
type=int, |
|
default=0, |
|
help='Integer base index to slice to.') |
|
parser.add_argument( |
|
'-F', |
|
'--FPoffset', |
|
action='store', |
|
type=int, |
|
default=0, |
|
help='If you want to capture regions around the target site, specify the 5\' offset.') |
|
parser.add_argument( |
|
'-T', |
|
'--TPoffset', |
|
action='store', |
|
type=int, |
|
default=0, |
|
help='If you want to capture regions around the target site, specify the 3\' offset.') |
|
parser.add_argument( |
|
'-b', |
|
'--blastmode', |
|
action='store_true', |
|
help='If flag is set, provide an input fasta (-f | --fasta), and BLAST will be called to find your sequence indices in the original genbank.') |
|
parser.add_argument( |
|
'-f', |
|
'--fasta', |
|
action='store', |
|
help='The operon fasta to pull annotations from the provided genbank.') |
|
parser.add_argument( |
|
'-t', |
|
'--tidy', |
|
action='store_true', |
|
help='Tell the script whether or not to remove the temporary files it generated during processing. Off by default. WARNING: removes files based on the "tmp" string.') |
|
parser.add_argument( |
|
'-m', |
|
'--meta', |
|
action='store', |
|
default=None, |
|
help='Specify a string to use as the Genbank meta-data fields if the parent one doesn\'t contain anything. Else inherits from parent sequence.') |
|
parser.add_argument( |
|
'-v', |
|
'--verbose', |
|
action='store_true', |
|
help='Verbose behaviour. Shows progress of BLAST etc. if appropriate.') |
|
|
|
|
|
args = parser.parse_args() |
|
|
|
except NameError: |
|
print "An exception occured with argument parsing. Check your provided options." |
|
|
|
genbank = args.genbank |
|
fasta = args.fasta |
|
split = os.path.splitext(args.genbank) |
|
basename = os.path.basename(split[0]) |
|
start = args.start |
|
end = args.end |
|
FPoffset = args.FPoffset |
|
TPoffset = args.TPoffset |
|
blastMode = args.blastmode |
|
outfile = args.outfile |
|
fasta = args.fasta |
|
tidy = args.tidy |
|
meta = args.meta |
|
verbose = args.verbose |
|
|
|
# Main code: |
|
if blastMode is not False and fasta is not None: |
|
refFasta = convert(basename,genbank) |
|
resultHandle = runBlast(basename, refFasta, fasta, verbose) |
|
start, end = getIndices(resultHandle) |
|
else: |
|
if fasta is None: |
|
print("No fasta was provided so BLAST mode cannot be used.") |
|
|
|
if start is None or end is None: |
|
print('No slice indices have been specified or retrieved from blastout') |
|
sys.exit(1) |
|
|
|
|
|
subRecord = slice(start, end, genbank, FPoffset, TPoffset) |
|
# Populate the metadata of the genbank |
|
if meta is not None: |
|
subRecord.id = meta |
|
subRecord.locus = meta + 'subregion' |
|
subRecord.accession = meta |
|
subRecord.name = meta |
|
subRecord.annotations["date"] = time.strftime("%d-%b-%Y").upper() |
|
# Other field options include: |
|
# subRecord.annotations["source"] |
|
# ["taxonomy"] |
|
# ["keywords"] |
|
# ["references"] |
|
# ["accessions"] |
|
# ["data_file_division"] e.g. BCT, PLN, UNK |
|
# ["organism"] |
|
# ["topology"] |
|
|
|
if outfile is not None: |
|
SeqIO.write(subRecord, outfile, "genbank") |
|
else: |
|
print(subRecord.format('genbank')) |
|
|
|
if tidy is True: |
|
if verbose is True: |
|
rm="rm -v ./*tmp*" |
|
else: |
|
rm="rm ./*tmp*" |
|
|
|
subprocess.Popen(rm,shell=True) |
|
|
|
if __name__ == "__main__": |
|
main() |
Thanks, sounds like just what I need. I'll be reading through until I understand
Happy to explain any points that might not be clear. If you already have a blast tabular file with the sequence you'd like, we can easily come up with a function that gives you back all the sequences plus an offset, but have a go first so you can learn! :)
Hello, I've checked it out. So I don't think I need the convert() and runBlast() functions since I'm beginning with a blast output in an .xml format. I imported all the functions from Genbank_slicer and then did this:
So this will get return the start and end indices of the very first hit in the results.tab file. In my case the first hit is just the same sequence as my query so I don't get any flanks. In order to get the start and end for other hits I need to change the function so it returns blast_result[2][0].hit_start, blast_result[3][0].hit_start and so on. I guess I could use some kind of loop to do this and return the start and end for every hit. The numbers here just stand for rows and columns, right? I'm not worried so much about this part right now, although I can't figure out how to convert out.xml to a tabular format object without writing to a file and reading that first.
I'm stuck currently at the next function slice(). I don't want to slice parts out of the original genbank file, but the sequences in the blast output I got later. If I try it out on my blast output in genbank format: #This is after reading the blast results as tabular data getIndices('results.tab') slice(start, end, "sequence.gb", 100, 100) I get the result "NameError: name 'start' is not defined". This is kinda confusing because the variables returned by getIndices() and the arguments for slice() have the same name. But shouldn't "start" and "end" already be defined? And this function only looks at one sequence while I need to be able to load and slice every single sequence in a large genbank file.
For slice() would this work to return 100bp flanks on either side put together?
Where I've replaced FPoffset and TP offset with just offset (which is equal to 100).