import sys
import os
import datetime
import logging
import argparse
python_lib = os.path.join(os.getenv("HOME"), "python_lib")
if not python_lib in sys.path: sys.path.append(python_lib)
from fs import FS
log_level = logging.DEBUG
logging.basicConfig(filename=None, format='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=log_level)
def main():
fs = FS()
args = get_arguments()
num_all_variants = 0
num_PASS_variants = 0
num_PASS_SNV_variants = 0
num_PASS_INS_variants = 0
num_PASS_DEL_variants = 0
for line in fs.read(args.input_vcf_file):
if line.startswith("##"):
continue
elif line.startswith("#"):
line = line[1:]
header = line.split("\t")
else:
num_all_variants = num_all_variants + 1
fields = line.split("\t")
REF = fields[header.index("REF")]
ALT = fields[header.index("ALT")]
FILTER = fields[header.index("FILTER")]
if FILTER == "PASS":
num_PASS_variants = num_PASS_variants + 1
if len(REF) == 1 and len(ALT) == 1:
num_PASS_SNV_variants = num_PASS_SNV_variants + 1
if len(REF) > 1 and len(ALT) == 1:
num_PASS_DEL_variants = num_PASS_DEL_variants + 1
if len(REF) == 1 and len(ALT) > 1:
num_PASS_INS_variants = num_PASS_INS_variants + 1
output = [num_all_variants, num_PASS_variants, num_PASS_SNV_variants, num_PASS_INS_variants, num_PASS_DEL_variants]
header_out = "vcf_file\tall_variants\tPASS_variants\tPASS_SNV\tPASS_INS\tPASS_DEL"
#print header_out.replace(", ", "\t")
#print(header_out)
#print(os.path.basename(args.input_vcf_file) + "\t" + "\t".join(map(str,output)))
fs.write("VCF.xls", header_out+"\n")
fs.write("VCF.xls", os.path.basename(args.input_vcf_file)[:50] + "\t" + "\t".join(map(str,output)) + "\n")
#logging.info(out_file)
fs.close_without_print()
#print "Done\n\n\n"
os.system("easy VCF.xls")
sys.exit()
def get_arguments():
main_description = '''\
Count VCF
'''
epi_log='''
This program requires the python packages:
- argparse
- openpyxl (pypm install "openpyxl<=2.1.5")
- logging
'''
help_help = '''\
show this help message and exit\
'''
version_help = '''\
show the version of this program\
'''
input_vcf_file_help = '''\
input VCF file
'''
arg_parser = argparse.ArgumentParser(description=main_description, epilog=epi_log, formatter_class=argparse.RawTextHelpFormatter, add_help=False)
arg_parser.register('type','bool', lambda s: str(s).lower() in ['true', '1', 't', 'y', 'yes']) # add type keyword to registries
###############################
# 1. required arguments #
###############################
required_group = arg_parser.add_argument_group("required arguments")
###############################
# 2. optional arguments #
###############################
optional_group = arg_parser.add_argument_group("optional arguments")
###############################
# 3. positional arguments #
###############################
arg_parser.add_argument('input_vcf_file', metavar="VCF_FILE", type=str, help=input_vcf_file_help)
optional_group.add_argument("-h", "--help", action="help", help=help_help)
optional_group.add_argument("-v", "--version", action="version", version="%(prog)s: version 1.0", help=version_help)
args = arg_parser.parse_args()
abs_paths = ["input_vcf_file"]
create_dirs = []
must_exist_paths = ["input_vcf_file"]
check_arguments(args, arg_parser, abs_paths, create_dirs, must_exist_paths)
return args
def check_arguments(args, arg_parser, abs_paths, create_dirs, must_exist_paths):
'''
check_arguments:
(1) change the paths into absolute paths
(2) create diretories if needed
(3) check if the paths exists in the file system
'''
for abs_path in abs_paths:
obj = eval("args." + abs_path)
if isinstance(obj, list):
# list of files
exec("args.{abs_path} = [os.path.abspath(x) for x in args.{abs_path}]".format(abs_path=abs_path)) in globals(), locals()
else:
# single file
exec("args.{abs_path} = os.path.abspath(args.{abs_path})".format(abs_path=abs_path)) in globals(), locals()
for create_dir in create_dirs:
dir_path = eval("args." + create_dir)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
logging.debug("%s is created!\n" % dir_path)
for must_exist_path in must_exist_paths:
obj = eval("args." + must_exist_path)
if isinstance(obj, list):
# list of files
for path in obj:
if not os.path.exists(path):
arg_parser.print_help()
logging.error("No file exist as \"%s\" \n" % path )
sys.exit()
else:
# single file
path = obj
if not os.path.exists(path):
arg_parser.print_help()
logging.error("\"%s\" does not exist!!!\n" % path )
sys.exit()
if __name__ == '__main__':
main()
Hi, can somebody add comments for knowing "Novel SNPs" in a .vcf file ?. Many Thanks.
You should post this as a separate question seeing as it has nothing to do with counting SNPs, and also you posted this as an answer which it isn't. I would suggest you post a new question, that works a lot better because this post already has 3 answers so most people will probably ignore it
Alright, i agree with your comment but since my original question was almost similar as "Method to count total number of SNPs and Novel SNP in a VCF file" so i though that most of the peoples will give me link of this post only so i asked the second half of my question on this post. Anyway, i'll post the second half as a new question. Thanks.
That's not the same question, and even if the question was already posted it's always good to get a fresh answer.