Question

Help with my fasta filter python scrips

0

Entering edit mode

2.2 years ago

Luis Carlos • 0

Hi guys, I am very new and dummy with python and I try to write a script to filter some alignments. The filter works removing the sequences for every alignment with a chi2 < 0.05 and writing a new fasta files without those sequences, but I have a problem. I have two terminals in some of the alignments that are my out-groups and I dont want to remove them, buy my script remove those sequences because some of them dont pass the chi2 test, so maybe some one can help with that. I will appreciate a lot. Thanks

Here is the part of the code that do the chi2 test:

# Chi2 analysis para folders v1
import sys
import pandas as pd
from Bio import AlignIO
from scipy import stats
import numpy as np
import os
import datetime
import csv

# Obtener la fecha y hora actual para el registro de la ejecución
current_time = datetime.datetime.now()
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")

def compositionMatrix(aln):
    compDict = {}
    fixedCharacters = ["-","A","C","G","T"]
    for record in aln:
        header = record.id
        seq = record.seq.upper()  # Convert sequence to uppercase
        currentSeqMat = [0]*5
        for i in range(len(seq)):
            nt = seq[i]
            try:
                characterPos = fixedCharacters.index(nt)
                currentSeqMat[characterPos] += 1
            except:
                print("ERROR:", header, "contains character ("+nt+") not in the list:",fixedCharacters)
        compDict[header] = currentSeqMat
    compDF = pd.DataFrame.from_dict(compDict, orient='index', columns=fixedCharacters)
    return compDF

def chi2test(compDF):
    seqTotals = compDF.sum(axis=1)
    gaps = compDF["-"]
    gapsPerSeq = gaps/seqTotals

    nonGap = compDF.loc[:, 'A':'T']
    nonGapTotals = nonGap.sum().to_frame()
    nonGapSeqTotals = nonGap.sum(axis=1).to_frame()
    numCharacters = nonGapTotals.sum()
    expectedFreq = nonGapTotals / numCharacters

    expectedCountArray = np.dot(nonGapSeqTotals, expectedFreq.transpose())
    expectedCountDF = pd.DataFrame(expectedCountArray, columns=nonGap.columns, index=nonGap.index.values )
    chi2DF = ((expectedCountDF - nonGap)**2)/expectedCountDF
    chi2Sum = chi2DF.sum(axis=1)

    pValueDf = 1 - stats.chi2.cdf(chi2Sum, 3)
    outDF = pd.DataFrame({"Gap/Ambiguity":gapsPerSeq,"p-value":pValueDf})
    outDF.index.name = 'header'
    return outDF
###############################################################################################################
# Obtener una lista de todos los archivos en la carpeta de entrada
input_folder = sys.argv[1]
print(f"Carpeta de entrada: {input_folder}")
input_files = os.listdir(input_folder)

# Crear la carpeta de salida si no existe
output_folder = os.path.join(input_folder, "filtered_alignments")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Procesar cada archivo individualmente
for input_file in input_files:
    # Saltar cualquier archivo que no sea un archivo fasta
    if not input_file.endswith(".fasta"):
        continue

    # Leer el archivo fasta de entrada
    input_file_path = os.path.join(input_folder, input_file)
    alignment = AlignIO.read(open(input_file_path), "fasta")

    # Obtener el dataframe de la composición de aminoácidos
    compDF = compositionMatrix(alignment)

    # Aplicar la prueba de chi2 y filtrar las secuencias que no pasan la prueba
    outDF = chi2test(compDF)
    removed_seqs = outDF.index[outDF['p-value'] < 0.05].tolist()

    # Crear un nuevo objeto de alineamiento sin las secuencias eliminadas
    filtered_alignment = AlignIO.MultipleSeqAlignment([seq for seq in alignment if seq.id not in removed_seqs])

    # Escribir el archivo de salida con los valores de p
    output_file = os.path.join(output_folder, input_file.replace(".fasta", "_filtered.tsv"))
    outDF.to_csv(output_file, sep='\t')

    # Escribir el archivo fasta sin las secuencias eliminadas
    output_fasta_file = os.path.join(output_folder, input_file.replace(".fasta", "_filtered.fasta"))
    with open(output_fasta_file, 'w') as f:
        AlignIO.write(filtered_alignment, f, 'fasta')
###############################################################################################################

# Registro de la ejecución y resultados
log_file = os.path.join(input_folder, "filtered_log.txt")
csv_file = os.path.join(input_folder, "filtered_sequences.csv")

with open(log_file, 'w') as f:
    f.write(f"Programa iniciado: {formatted_time}\n")
    f.write(f"Carpeta de entrada: {input_folder}\n")
    f.write(f"Carpeta de salida: {output_folder}\n\n")

    results = []
    for input_file in input_files:
        if input_file.endswith(".fasta"):
            input_file_path = os.path.join(input_folder, input_file)
            f.write(f"Leyendo archivo fasta de entrada: {input_file}\n")
            alignment = AlignIO.read(open(input_file_path), "fasta")
            compDF = compositionMatrix(alignment)
            outDF = chi2test(compDF)
            removed_seqs = outDF.index[outDF['p-value'] < 0.05].tolist()
            if removed_seqs:
                f.write(f"Secuencias removidas en {input_file}:\n")
                for seq in removed_seqs:
                    f.write(f"- {seq}\n")
                    results.append((input_file, seq, outDF.loc[seq]['p-value']))

                filtered_alignment = AlignIO.MultipleSeqAlignment([seq for seq in alignment if seq.id not in removed_seqs])
                output_file = os.path.join(output_folder, input_file.replace(".fasta", "_filtered.tsv"))
                outDF.to_csv(output_file, sep='\t')
                output_fasta_file = os.path.join(output_folder, input_file.replace(".fasta", "_filtered.fasta"))
                with open(output_fasta_file, 'w') as f2:
                    AlignIO.write(filtered_alignment, f2, 'fasta')
                f.write(f"Archivos de salida guardados en {output_folder}\n")
            else:
                f.write(f"No se removieron secuencias en {input_file}\n")

    f.write("Programa finalizado.")

    # Guardar archivo CSV con los resultados
    with open(csv_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["Archivo", "Secuencia", "Valor de p"])
        for result in results:
            writer.writerow(result)

print("Se han generado los archivos de salida y el archivo CSV con los resultados.")

FASTA python chi2 • 1.5k views

ADD COMMENT • link 2.2 years ago by Luis Carlos • 0

score 0 · Answer 1 · 2023-04-24

One easy way would be to add the following into the script:

wanted_terminals = ['outgroup1','outgroup2']
removed_seqs = [x for x in removed_seqs if x not in set(wanted_terminals)]

For example:

removed_seqs = ['a','b','c','d','e']
wanted_terminals = ['c','e']
removed_seqs = [x for x in removed_seqs if x not in set(wanted_terminals)]

>>> print(removed_seqs)
['a', 'b', 'd']

Or you could do:

wanted_terminals = ['outgroup1','outgroup2']
removed_seqs = list(set(removed_seqs) - set(wanted_terminals))

Alternatively, a more concise but harder to read version can be done by modifying your script:

removed_seqs = [x for x in outDF.index[outDF['p-value'] < 0.05].tolist() if x not in ['outgroup1','outgroup2']]

Obviously you need to use the actual seq IDs of your outgroups.