I am trying to parse the output of the following program, but I don't know which parts of that output I can use as features. The program and its output are shown below:
import static org.biojava3.ws.alignment.qblast.BlastAlignmentParameterEnum.ENTREZ_QUERY;

import java.io.*;

import org.biojava3.core.sequence.io.util.IOUtils;
import org.biojava3.ws.alignment.qblast.*;
/**
 * Demo that submits a protein sequence to a BLAST service through
 * {@code NCBIQBlastService}, polls until the results are ready, and copies the
 * returned report line by line into {@link #BLAST_OUTPUT_FILE}.
 */
public class NCBIQBlastServiceDemo {

    /** File to save blast results to. */
    private static final String BLAST_OUTPUT_FILE = "blastOutput.txt";

    /**
     * Blast query sequence.
     *
     * NOTE(review): the literal contains the characters ". " between "AAMVGAV" and
     * "LTALLAG". These are not amino-acid letters and look like a paste artifact;
     * confirm against the intended sequence and remove them if so.
     */
    private static final String SEQUENCE = "MLLAVLYCLLWSFQTSAGHFPRACVSSKNLMEKECCPPWSGDRSPCGQLSGRGSCQNILLSNAPLGPQFPFTGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFWGPNCTERRLLVRRNIFDLSAPEKDKFFAYLTLAKHTISSDYVIPIGTYGQMKNGSTPMFNDINIYDLFVWMHYYVSMDALLGGSEIWRDIDFAHEAPAFLPWHRLFLLRWEQEIQKLTGDENFTIPYWDWRDAEKCDICTDEYMGGQHPTNPNLLSPASFFSSWQIVCSRLEEYNSHQSLCNGTPEGPLRRNPGNHDKSRTPRLPSSADVEFCLSLTQYESGSMDKAANFSFRNTLEGFASPLTGIADASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEANAPIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSDPDSFQDYIKSYLEQASRIWSWLLGAAMVGAV. LTALLAGLVSLLCRHKRKQLPEEKQPLLMEKEDYHSLYQSHL";

    /**
     * Sends {@link #SEQUENCE} as a blastp request, waits for the results and writes
     * them to {@link #BLAST_OUTPUT_FILE}. Failures are reported on the console; the
     * method does not rethrow them.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        NCBIQBlastService service = new NCBIQBlastService();

        // set alignment options
        NCBIQBlastAlignmentProperties props = new NCBIQBlastAlignmentProperties();
        props.setBlastProgram(BlastProgramEnum.blastp);
        props.setBlastDatabase("uniprot");
        props.setAlignmentOption(ENTREZ_QUERY, "\"serum albumin\"[Protein name] AND mammals[Organism]");

        // set output options
        NCBIQBlastOutputProperties outputProps = new NCBIQBlastOutputProperties();
        // in this example we use default values set by constructor
        // (XML format, pairwise alignment, 100 descriptions and alignments)

        // Example of two possible ways of setting output options:
        // outputProps.setAlignmentNumber(200);
        // outputProps.setOutputOption(BlastOutputParameterEnum.ALIGNMENTS, "200");

        String rid = null; // blast request ID; stays null if the request could not be sent
        FileWriter writer = null;
        BufferedReader reader = null;
        try {
            // send blast request and save request id
            rid = service.sendAlignmentRequest(SEQUENCE, props);

            // wait until results become available. Alternatively, one can do other
            // computations/send other alignment requests
            while (!service.isReady(rid)) {
                System.out.println("Waiting for results. Sleeping for 5 seconds");
                Thread.sleep(5000);
            }

            // read results when they are ready
            InputStream in = service.getAlignmentResults(rid, outputProps);
            reader = new BufferedReader(new InputStreamReader(in));

            // write blast output to specified file
            File f = new File(BLAST_OUTPUT_FILE);
            System.out.println("Saving query results in file " + f.getAbsolutePath());
            writer = new FileWriter(f);

            // looked up once instead of on every iteration of the copy loop
            String lineSeparator = System.getProperty("line.separator");
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(line);
                writer.write(lineSeparator);
            }
        } catch (InterruptedException e) {
            // restore the interrupt flag so code further up the stack can still see it
            Thread.currentThread().interrupt();
            System.out.println(e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        } finally {
            // clean up
            IOUtils.close(writer);
            IOUtils.close(reader);
            // delete given alignment results from blast server (optional operation);
            // skipped when no request id was ever obtained
            if (rid != null) {
                service.sendDeleteRequest(rid);
            }
        }
    }
}
// output:
<blastoutput> <blastoutput_program>blastp</blastoutput_program> <blastoutput_version>BLASTP 2.2.26+</blastoutput_version> <blastoutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.</blastoutput_reference> <blastoutput_db>swissprot</blastoutput_db> <blastoutput_query-id>15371</blastoutput_query-id> <blastoutput_query-def>unnamed protein product</blastoutput_query-def> <blastoutput_query-len>529</blastoutput_query-len> <blastoutput_param> <parameters> <parameters_matrix>BLOSUM62</parameters_matrix> <parameters_expect>10</parameters_expect> <parameters_gap-open>11</parameters_gap-open> <parameters_gap-extend>1</parameters_gap-extend> <parameters_filter>F</parameters_filter> </parameters> </blastoutput_param> <blastoutput_iterations> <iteration> <iteration_iter-num>1</iteration_iter-num> <iteration_query-id>15371</iteration_query-id> <iteration_query-def>unnamed protein product</iteration_query-def> <iteration_query-len>529</iteration_query-len> <iteration_hits> <hit> <hit_num>1</hit_num> <hit_id>gi|1351907|sp|P02769.4|ALBU_BOVIN</hit_id> <hit_def>RecName: Full=Serum albumin; AltName: Full=BSA; AltName: Allergen=Bos d 6; Flags: Precursor</hit_def> <hit_accession>P02769</hit_accession> <hit_len>607</hit_len> <hit_hsps> <hsp> <hsp_num>1</hsp_num> <hsp_bit-score>21.557</hsp_bit-score> <hsp_score>44</hsp_score> <hsp_evalue>1.44818</hsp_evalue> <hsp_query-from>342</hsp_query-from> <hsp_query-to>377</hsp_query-to> <hsp_hit-from>68</hsp_hit-from> <hsp_hit-to>101</hsp_hit-to> <hsp_query-frame>0</hsp_query-frame> <hsp_hit-frame>0</hsp_hit-frame> <hsp_identity>11</hsp_identity> <hsp_positive>16</hsp_positive> <hsp_gaps>2</hsp_gaps> <hsp_align-len>36</hsp_align-len> <hsp_qseq>NTLEGFASPLTGIADASQSSMHNALHIYMNGTMSQV</hsp_qseq> 
<hsp_hseq>NELTEFAK--TCVADESHAGCEKSLHTLFGDELCKV</hsp_hseq> <hsp_midline>N L FA T +AD S + +LH + +V</hsp_midline> </hsp> </hit_hsps> </hit> <hit> <hit_num>2</hit_num> <hit_id>gi|113582|sp|P14639.1|ALBU_SHEEP</hit_id> <hit_def>RecName: Full=Serum albumin; Flags: Precursor</hit_def> <hit_accession>P14639</hit_accession> <hit_len>607</hit_len> <hit_hsps> <hsp> <hsp_num>1</hsp_num> <hsp_bit-score>19.2458</hsp_bit-score> <hsp_score>38</hsp_score> <hsp_evalue>7.12869</hsp_evalue> <hsp_query-from>352</hsp_query-from> <hsp_query-to>377</hsp_query-to> <hsp_hit-from>76</hsp_hit-from> <hsp_hit-to>101</hsp_hit-to> <hsp_query-frame>0</hsp_query-frame> <hsp_hit-frame>0</hsp_hit-frame> <hsp_identity>7</hsp_identity> <hsp_positive>12</hsp_positive> <hsp_gaps>0</hsp_gaps> <hsp_align-len>26</hsp_align-len> <hsp_qseq>TGIADASQSSMHNALHIYMNGTMSQV</hsp_qseq> <hsp_hseq>TCVADESHAGCDKSLHTLFGDELCKV</hsp_hseq> <hsp_midline>T +AD S + +LH + +V</hsp_midline> </hsp> </hit_hsps> </hit> <hit> <hit_num>3</hit_num> <hit_id>gi|3121749|sp|O35090.1|ALBU_MERUN</hit_id> <hit_def>RecName: Full=Serum albumin; Flags: Precursor</hit_def> <hit_accession>O35090</hit_accession> <hit_len>609</hit_len> <hit_hsps> <hsp> <hsp_num>1</hsp_num> <hsp_bit-score>18.8606</hsp_bit-score> <hsp_score>37</hsp_score> <hsp_evalue>9.2992</hsp_evalue> <hsp_query-from>219</hsp_query-from> <hsp_query-to>250</hsp_query-to> <hsp_hit-from>88</hsp_hit-from> <hsp_hit-to>119</hsp_hit-to> <hsp_query-frame>0</hsp_query-frame> <hsp_hit-frame>0</hsp_hit-frame> <hsp_identity>8</hsp_identity> <hsp_positive>17</hsp_positive> <hsp_gaps>0</hsp_gaps> <hsp_align-len>32</hsp_align-len> <hsp_qseq>EQEIQKLTGDENFTIPYWDWRDAEKCDICTDE</hsp_qseq> <hsp_hseq>DKSLHTLFGDKLCSLPNFGEKYAEMADCCAKQ</hsp_hseq> <hsp_midline>++ + L GD+ ++P + + AE D C +</hsp_midline> </hsp> </hit_hsps> </hit> </iteration_hits> <iteration_stat> <statistics> <statistics_db-num>17</statistics_db-num> <statistics_db-len>9808</statistics_db-len> <statistics_hsp-len>0</statistics_hsp-len> 
<statistics_eff-space>0</statistics_eff-space> <statistics_kappa>0.041</statistics_kappa> <statistics_lambda>0.267</statistics_lambda> <statistics_entropy>0.14</statistics_entropy> </statistics> </iteration_stat> </iteration> </blastoutput_iterations> </blastoutput>
Do you really need to limit yourself to Java? Maybe if you could describe what you actually need to do, somebody could come up with a quick solution in another language.