Help with my fasta filter python scrips
1
0
Entering edit mode
20 months ago

Hi guys, I am very new and dummy with python and I try to write a script to filter some alignments. The filter works removing the sequences for every alignment with a chi2 < 0.05 and writing a new fasta files without those sequences, but I have a problem. I have two terminals in some of the alignments that are my out-groups and I dont want to remove them, buy my script remove those sequences because some of them dont pass the chi2 test, so maybe some one can help with that. I will appreciate a lot. Thanks

Here is the part of the code that do the chi2 test:

# Chi2 analysis para folders v1
import sys
import pandas as pd
from Bio import AlignIO
from scipy import stats
import numpy as np
import os
import datetime
import csv

# Obtener la fecha y hora actual para el registro de la ejecución
current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")

def compositionMatrix(aln):
    compDict = {}
    fixedCharacters = ["-","A","C","G","T"]
    for record in aln:
        header = record.id
        seq = record.seq.upper()  # Convert sequence to uppercase
        currentSeqMat = [0]*5
        for i in range(len(seq)):
            nt = seq[i]
            try:
                characterPos = fixedCharacters.index(nt)
                currentSeqMat[characterPos] += 1
            except:
                print("ERROR:", header, "contains character ("+nt+") not in the list:",fixedCharacters)
        compDict[header] = currentSeqMat
    compDF = pd.DataFrame.from_dict(compDict, orient='index', columns=fixedCharacters)
    return compDF

def chi2test(compDF):
    seqTotals = compDF.sum(axis=1)
    gaps = compDF["-"]
    gapsPerSeq = gaps/seqTotals

    nonGap = compDF.loc[:, 'A':'T']
    nonGapTotals = nonGap.sum().to_frame()
    nonGapSeqTotals = nonGap.sum(axis=1).to_frame()
    numCharacters = nonGapTotals.sum()
    expectedFreq = nonGapTotals / numCharacters

    expectedCountArray = np.dot(nonGapSeqTotals, expectedFreq.transpose())
    expectedCountDF = pd.DataFrame(expectedCountArray, columns=nonGap.columns, index=nonGap.index.values )
    chi2DF = ((expectedCountDF - nonGap)**2)/expectedCountDF
    chi2Sum = chi2DF.sum(axis=1)

    pValueDf = 1 - stats.chi2.cdf(chi2Sum, 3)
    outDF = pd.DataFrame({"Gap/Ambiguity":gapsPerSeq,"p-value":pValueDf})
    outDF.index.name = 'header'
    return outDF
###############################################################################################################
# Obtener una lista de todos los archivos en la carpeta de entrada
input_folder = sys.argv[1]
print(f"Carpeta de entrada: {input_folder}")
input_files = os.listdir(input_folder)

# Crear la carpeta de salida si no existe
output_folder = os.path.join(input_folder, "filtered_alignments")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Procesar cada archivo individualmente
for input_file in input_files:
    # Saltar cualquier archivo que no sea un archivo fasta
    if not input_file.endswith(".fasta"):
        continue

    # Leer el archivo fasta de entrada
    input_file_path = os.path.join(input_folder, input_file)
    alignment = AlignIO.read(open(input_file_path), "fasta")

    # Obtener el dataframe de la composición de aminoácidos
    compDF = compositionMatrix(alignment)

    # Aplicar la prueba de chi2 y filtrar las secuencias que no pasan la prueba
    outDF = chi2test(compDF)
    removed_seqs = outDF.index[outDF['p-value'] < 0.05].tolist()

    # Crear un nuevo objeto de alineamiento sin las secuencias eliminadas
    filtered_alignment = AlignIO.MultipleSeqAlignment([seq for seq in alignment if seq.id not in removed_seqs])

    # Escribir el archivo de salida con los valores de p
    output_file = os.path.join(output_folder, input_file.replace(".fasta", "_filtered.tsv"))
    outDF.to_csv(output_file, sep='\t')

    # Escribir el archivo fasta sin las secuencias eliminadas
    output_fasta_file = os.path.join(output_folder, input_file.replace(".fasta", "_filtered.fasta"))
    with open(output_fasta_file, 'w') as f:
        AlignIO.write(filtered_alignment, f, 'fasta')
###############################################################################################################

# Registro de la ejecución y resultados
log_file = os.path.join(input_folder, "filtered_log.txt")
csv_file = os.path.join(input_folder, "filtered_sequences.csv")

with open(log_file, 'w') as f:
    f.write(f"Programa iniciado: {formatted_time}\n")
    f.write(f"Carpeta de entrada: {input_folder}\n")
    f.write(f"Carpeta de salida: {output_folder}\n\n")

    results = []
    for input_file in input_files:
        if input_file.endswith(".fasta"):
            input_file_path = os.path.join(input_folder, input_file)
            f.write(f"Leyendo archivo fasta de entrada: {input_file}\n")
            alignment = AlignIO.read(open(input_file_path), "fasta")
            compDF = compositionMatrix(alignment)
            outDF = chi2test(compDF)
            removed_seqs = outDF.index[outDF['p-value'] < 0.05].tolist()
            if removed_seqs:
                f.write(f"Secuencias removidas en {input_file}:\n")
                for seq in removed_seqs:
                    f.write(f"- {seq}\n")
                    results.append((input_file, seq, outDF.loc[seq]['p-value']))

                filtered_alignment = AlignIO.MultipleSeqAlignment([seq for seq in alignment if seq.id not in removed_seqs])
                output_file = os.path.join(output_folder, input_file.replace(".fasta", "_filtered.tsv"))
                outDF.to_csv(output_file, sep='\t')
                output_fasta_file = os.path.join(output_folder, input_file.replace(".fasta", "_filtered.fasta"))
                with open(output_fasta_file, 'w') as f2:
                    AlignIO.write(filtered_alignment, f2, 'fasta')
                f.write(f"Archivos de salida guardados en {output_folder}\n")
            else:
                f.write(f"No se removieron secuencias en {input_file}\n")

    f.write("Programa finalizado.")

    # Guardar archivo CSV con los resultados
    with open(csv_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["Archivo", "Secuencia", "Valor de p"])
        for result in results:
            writer.writerow(result)

print("Se han generado los archivos de salida y el archivo CSV con los resultados.")
FASTA python chi2 • 1.2k views
ADD COMMENT
0
Entering edit mode
20 months ago
cfos4698 ★ 1.1k

One easy way would be to add the following into the script:

wanted_terminals = ['outgroup1','outgroup2']
removed_seqs = [x for x in removed_seqs if x not in set(wanted_terminals)]

For example:

removed_seqs = ['a','b','c','d','e']
wanted_terminals = ['c','e']
removed_seqs = [x for x in removed_seqs if x not in set(wanted_terminals)]
>>> print(removed_seqs)
['a', 'b', 'd']

Or you could do:

wanted_terminals = ['outgroup1','outgroup2']
removed_seqs = list(set(removed_seqs) - set(wanted_terminals))

Alternatively, a more concise but harder to read version can be done by modifying your script:

removed_seqs = [x for x in outDF.index[outDF['p-value'] < 0.05].tolist() if x not in ['outgroup1','outgroup2']]

Obviously you need to use the actual seq IDs of your outgroups.

ADD COMMENT
0
Entering edit mode

Thanks for you're answer I am going to test it and I let you know how works

Best

ADD REPLY

Login before adding your answer.

Traffic: 1760 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6