caveat : I'm Christian's colleague
the following java program builds the GO tree using the "is_a" relationships, get the go-ACN under the user's term and filter GOA with those ACN.
A more efficient program would store everything into a local database.
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
public class Biostar55984
{
private static Logger LOG=Logger.getLogger("Biostar55984");
private static final String GOA="ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/HUMAN/gene_association.goa_human.gz";
private static final String GO_DAILY="http://archive.geneontology.org/go_daily-termdb.rdf-xml.gz";
private static final String NS="http://www.geneontology.org/dtds/go.dtd#";
private static final String RDF="http://www.w3.org/1999/02/22-rdf-syntax-ns#";
private static final QName rdf_about=new QName(RDF, "about", "rdf");
private static final QName rdf_resource=new QName(RDF, "resource", "rdf");
private static class Term
{
String uri;
String acn;
Set<Term> children=new HashSet<Term>();
@Override
public int hashCode() {
return uri.hashCode();
}
@Override
public boolean equals(Object obj) {
return this.uri.equals(Term.class.cast(obj).uri);
}
void getChildrenAcn(Set<String> acns)
{
if(acn!=null) acns.add(acn);
for(Term t: children) t.getChildrenAcn(acns);
}
}
private Map<String,Term> uri2term=new HashMap<String,Term>(40000);
private Term getTermByUri(String uri)
{
Term term=uri2term.get(uri);
if(term==null)
{
term=new Term();
term.uri=uri;
uri2term.put(term.uri, term);
}
return term;
}
private void parseTerm(XMLEventReader r,StartElement rootE) throws Exception
{
Attribute att=rootE.getAttributeByName(rdf_about);
if(att==null ) throw new XMLStreamException("cannot find "+rdf_about+" in term");
Term term=getTermByUri(att.getValue());
while(r.hasNext())
{
XMLEvent evt=r.nextEvent();
if(evt.isStartElement())
{
StartElement startE=evt.asStartElement();
QName nameE=startE.getName();
if(!nameE.getNamespaceURI().equals(NS)) continue;
if(nameE.getLocalPart().equals("accession"))
{
term.acn=r.getElementText().trim();
}
else if(nameE.getLocalPart().equals("is_a"))
{
att=startE.getAttributeByName(rdf_resource);
if(att==null ) throw new XMLStreamException("cannot find "+rdf_resource+" in is_a "+term.acn);
String parentURI=att.getValue();
Term parent=getTermByUri(parentURI);
parent.children.add(term);
}
}
else if(evt.isEndElement())
{
EndElement endE=evt.asEndElement();
QName nameE=endE.getName();
if(!nameE.getNamespaceURI().equals(NS)) continue;
if(nameE.getLocalPart().equals("term")) break;
}
}
r.close();
}
private void parseTerms() throws Exception
{
LOG.info("reading "+GO_DAILY);
XMLInputFactory factory = XMLInputFactory.newInstance();
factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
factory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
InputStream in=new URL(GO_DAILY).openStream();
in=new GZIPInputStream(in);
XMLEventReader r= factory.createXMLEventReader(in);
while(r.hasNext())
{
XMLEvent evt=r.nextEvent();
if(evt.isStartElement())
{
StartElement startE=evt.asStartElement();
QName nameE=startE.getName();
if(!nameE.getNamespaceURI().equals(NS)) continue;
if(!nameE.getLocalPart().equals("term")) continue;
parseTerm(r,startE);
}
}
r.close();
in.close();
LOG.info("terms "+uri2term.size());
}
private Set<String> findChildrenOf(String acn)
{
Set<String> acns=new HashSet<String>();
for(Term t:this.uri2term.values())
{
if(!acn.equals(t.acn)) continue;
t.getChildrenAcn(acns);
}
return acns;
}
private void scanGOA(String acn) throws Exception
{
LOG.info("reading "+GOA);
Pattern tab=Pattern.compile("\t");
Set<String> acns=this.findChildrenOf(acn);
LOG.info("acns "+acns.size());
String line;
BufferedReader in=new BufferedReader(new InputStreamReader(new GZIPInputStream(new URL(GOA).openStream())));
while((line=in.readLine())!=null)
{
if(line.startsWith("!")) continue;
String tokens[]=tab.split(line);
if(!acns.contains(tokens[3])) continue;
System.out.println(line);
}
in.close();
}
/**
* @param args
*/
public static void main(String[] args) throws Exception
{/*
System.setProperty("http.proxyHost", "xxxxxxxxx");
System.setProperty("http.proxyPort", "xxxx");
System.setProperty("ftp.proxyHost", "xxxxxxxxx");
System.setProperty("ftp.proxyPort", "xxxx");*/
Biostar55984 app=new Biostar55984();
app.parseTerms();
app.scanGOA(args[0]);
}
}
compile:
javac Biostar55984.java
run:
$ java Biostar55984 "GO:0006915"
UniProtKB A0PJW8 GO:0006915 GO_REF:0000037 ECO:0000322 UniProtKB-KW:KW-0053 taxon:9606 20121027 UniProtKB
UniProtKB A1A4S6 GO:0006915 Reactome:REACT_578 ECO:0000304 taxon:9606 20080327 Reactome
UniProtKB A1L190 GO:0006915 GO_REF:0000024 ECO:0000250 UniProtKB:B5KM66 taxon:9606 20111130 UniProtKB
UniProtKB A2A341 GO:0006915 GO_REF:0000019 ECO:0000265 Ensembl:ENSMUSP00000079909 taxon:9606 20121027 ENSEMBL
UniProtKB A5LHX3 GO:0006915 Reactome:REACT_578 ECO:0000304 taxon:9606 20100909 Reactome
UniProtKB A6NCW0 GO:0006915 GO_REF:0000037 ECO:0000322 UniProtKB-KW:KW-0053 taxon:9606 20121027 UniProtKB
UniProtKB A6NMY3 GO:0006915 GO_REF:0000019 ECO:0000265 Ensembl:ENSMUSP00000071904 taxon:9606 20121027 ENSEMBL
UniProtKB A8K0U1 GO:0006915 GO_REF:0000019 ECO:0000265 Ensembl:ENSMUSP00000028377 taxon:9606 20121027 ENSEMBL
UniProtKB A8MUK1 GO:0006915 GO_REF:0000037 ECO:0000322 UniProtKB-KW:KW-0053 taxon:9606 20121027 UniProtKB
UniProtKB A8MUM7 GO:0006915 GO_REF:0000037 ECO:0000322 UniProtKB-KW:KW-0053 taxon:9606 20121027 UniProtKB
UniProtKB A8MVM1 GO:0006915 GO_REF:0000002 ECO:0000256 InterPro:IPR015470|InterPro:IPR015917 taxon:9606 20121027 InterPro
UniProtKB B2CW77 GO:0006915 GO_REF:0000037 ECO:0000322 UniProtKB-KW:KW-0053 taxon:9606 20121027 UniProtKB
UniProtKB B4DEQ4 GO:0006915 GO_REF:0000019 ECO:0000265 Ensembl:ENSMUSP00000003433 taxon:9606 20121027 ENSEMBL
UniProtKB B4DGU4 GO:0006915 GO_REF:0000019 ECO:0000265 Ensembl:ENSMUSP00000007130 taxon:9606 20121027 ENSEMBL
UniProtKB B4DHJ7 GO:0051402 GO_REF:0000019 ECO:0000265 Ensembl:ENSRNOP00000023477 taxon:9606 20121027 ENSEMBL
UniProtKB B4DQU7 GO:0006915 GO_REF:0000002 ECO:0000256 InterPro:IPR015471|InterPro:IPR015917 taxon:9606 20121027 InterPro
UniProtKB B4DVD8 GO:0006915 GO_REF:0000002 ECO:0000256 InterPro:IPR015917 taxon:9606 20121027 InterPro
UniProtKB B4E2Z2 GO:0006915 GO_REF:0000019 ECO:0000265 Ensembl:ENSMUSP00000024967 taxon:9606 20121027 ENSEMBL
UniProtKB B5MC21 GO:0001781 GO_REF:0000019 ECO:0000265 Ensembl:ENSMUSP00000026845 taxon:9606 20121027 ENSEMBL
UniProtKB B5MCX1 GO:0006915 GO_REF:0000019 ECO:0000265 Ensembl:ENSMUSP00000074708 taxon:9606 20121027 ENSEMBL
(....)
the question is: how to retrieve all the genes annotated with a given GO-term and its descendants.