|
package org.lindenb.acn2taxonomy; |
|
|
|
import java.io.BufferedReader; |
|
import java.io.File; |
|
import java.io.IOException; |
|
import java.io.InputStream; |
|
import java.io.InputStreamReader; |
|
import java.net.URL; |
|
import java.net.URLConnection; |
|
import java.util.ArrayList; |
|
import java.util.List; |
|
import java.util.logging.Level; |
|
import java.util.logging.Logger; |
|
import java.util.regex.Pattern; |
|
|
|
import javax.xml.parsers.DocumentBuilder; |
|
import javax.xml.parsers.DocumentBuilderFactory; |
|
import javax.xml.parsers.SAXParser; |
|
import javax.xml.parsers.SAXParserFactory; |
|
|
|
import org.lindenb.berkeley.db.PrimaryDB; |
|
import org.lindenb.io.IOUtils; |
|
import org.lindenb.me.Me; |
|
import org.lindenb.util.C; |
|
import org.lindenb.util.Compilation; |
|
import org.lindenb.util.StringUtils; |
|
import org.lindenb.xml.XMLUtilities; |
|
import org.w3c.dom.Document; |
|
import org.w3c.dom.Element; |
|
import org.xml.sax.Attributes; |
|
import org.xml.sax.InputSource; |
|
import org.xml.sax.SAXException; |
|
import org.xml.sax.helpers.DefaultHandler; |
|
|
|
import com.sleepycat.bind.tuple.IntegerBinding; |
|
import com.sleepycat.bind.tuple.TupleBinding; |
|
import com.sleepycat.bind.tuple.TupleInput; |
|
import com.sleepycat.bind.tuple.TupleOutput; |
|
import com.sleepycat.je.DatabaseConfig; |
|
import com.sleepycat.je.Environment; |
|
import com.sleepycat.je.EnvironmentConfig; |
|
|
|
public class AcnToTaxonomy |
|
{ |
|
private static final Logger LOG=Logger.getLogger("org.lindenb"); |
|
private File baseDir=new File(System.getProperty("java.io.tmpdir")); |
|
private File dbHome=null; |
|
private Environment environment=null; |
|
private PrimaryDB<Integer, TaxonNode> id2taxon=null; |
|
private DocumentBuilder docBuilder; |
|
private long sleep_time=100; |
|
|
|
private static class TaxonNode |
|
{ |
|
int id; |
|
String name=""; |
|
int parent_id=-1; |
|
} |
|
|
|
private static class TaxonBinding |
|
extends TupleBinding<TaxonNode> |
|
{ |
|
@Override |
|
public TaxonNode entryToObject(TupleInput in) |
|
{ |
|
TaxonNode n=new TaxonNode(); |
|
n.id=in.readInt(); |
|
n.name=in.readString(); |
|
n.parent_id=in.readInt(); |
|
return n; |
|
} |
|
@Override |
|
public void objectToEntry(TaxonNode node, TupleOutput out) |
|
{ |
|
out.writeInt(node.id); |
|
out.writeString(node.name); |
|
out.writeInt(node.parent_id); |
|
} |
|
} |
|
|
|
private class TinyXmlHandler |
|
extends DefaultHandler |
|
{ |
|
private StringBuilder text=null; |
|
private int TSeq_taxid=-1; |
|
private String TSeq_defline=null; |
|
private String error=null; |
|
TinyXmlHandler(String acn) |
|
{ |
|
|
|
} |
|
@Override |
|
public void startElement(String uri, String localName, String name, |
|
Attributes attributes) throws SAXException |
|
{ |
|
text=null; |
|
if(StringUtils.isIn(name,"TSeq_taxid","TSeq_defline","Error")) |
|
{ |
|
this.text=new StringBuilder(); |
|
} |
|
} |
|
@Override |
|
public void endElement(String uri, String localName, String name) throws SAXException |
|
{ |
|
if(name.equals("TSeq_taxid")) { this.TSeq_taxid= Integer.parseInt(this.text.toString());} |
|
else if(name.equals("TSeq_defline")) { this.TSeq_defline= this.text.toString();} |
|
else if(name.equals("Error")) { this.error= this.text.toString();} |
|
text=null; |
|
} |
|
@Override |
|
public void characters(char[] ch, int start, int length) |
|
throws SAXException { |
|
if(this.text!=null) text.append(ch, start, length); |
|
} |
|
} |
|
|
|
private AcnToTaxonomy() |
|
throws Exception |
|
{ |
|
DocumentBuilderFactory f=DocumentBuilderFactory.newInstance(); |
|
f.setCoalescing(true); |
|
f.setNamespaceAware(false); |
|
f.setValidating(false); |
|
f.setExpandEntityReferences(true); |
|
f.setIgnoringComments(true); |
|
f.setIgnoringElementContentWhitespace(true); |
|
this.docBuilder= f.newDocumentBuilder(); |
|
} |
|
|
|
|
|
private void open() throws IOException |
|
{ |
|
this.dbHome=IOUtils.createTempDir(this.baseDir); |
|
LOG.info("created "+this.dbHome); |
|
EnvironmentConfig envConfig= new EnvironmentConfig(); |
|
envConfig.setAllowCreate(true); |
|
envConfig.setReadOnly(false); |
|
this.environment= new Environment(dbHome, envConfig); |
|
LOG.info("opened bdbd env"); |
|
DatabaseConfig dbConfig=new DatabaseConfig(); |
|
dbConfig.setAllowCreate(true); |
|
dbConfig.setReadOnly(false); |
|
this.id2taxon=new PrimaryDB<Integer, TaxonNode>(this.environment, null, "id2taxon", dbConfig, new IntegerBinding(), new TaxonBinding()); |
|
} |
|
|
|
private void close() |
|
{ |
|
if(this.id2taxon!=null) |
|
{ |
|
LOG.info("closing database"); |
|
this.id2taxon.close(); |
|
this.id2taxon=null; |
|
} |
|
if(this.environment!=null) |
|
{ |
|
LOG.info("closing bdbd env"); |
|
this.environment.close(); |
|
this.environment=null; |
|
} |
|
if(this.dbHome!=null) |
|
{ |
|
for(File f: this.dbHome.listFiles()) |
|
{ |
|
f.delete(); |
|
} |
|
this.dbHome.delete(); |
|
this.dbHome=null; |
|
} |
|
} |
|
private InputStream openURL(URL url)throws IOException |
|
{ |
|
final int max_try=10; |
|
for(int try_count=0;try_count<max_try;++try_count) |
|
{ |
|
InputStream is=null; |
|
try |
|
{ |
|
URLConnection con=url.openConnection(); |
|
con.setConnectTimeout(10*1000); |
|
is=con.getInputStream(); |
|
return is; |
|
} |
|
catch(Exception err) |
|
{ |
|
System.err.println("Cannot open "+url+" trying... "+(try_count+1)+"/"+try_count); |
|
try |
|
{ |
|
Thread.sleep(10*1000); |
|
} |
|
catch (InterruptedException e) |
|
{ |
|
} |
|
} |
|
} |
|
throw new IOException("Cannot open "+url); |
|
} |
|
|
|
|
|
|
|
private StringBuilder taxopath(int taxonid,StringBuilder str) throws Exception |
|
{ |
|
TaxonNode node=this.id2taxon.get(null, taxonid); |
|
if(node==null) |
|
{ |
|
String url="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id="+taxonid+"&retmode=xml&tool=acn2tax&email=plindenbaum_at_yahoo_fr"; |
|
InputStream in=openURL(new URL(url)); |
|
Document dom= this.docBuilder.parse(new InputSource(in)); |
|
in.close(); |
|
Element root=dom.getDocumentElement(); |
|
Element Taxon=XMLUtilities.one(root, "Taxon"); |
|
|
|
node=new TaxonNode(); |
|
|
|
Element TaxId=XMLUtilities.one(Taxon, "TaxId"); |
|
Element ScientificName=XMLUtilities.one(Taxon, "ScientificName"); |
|
node.id= Integer.parseInt(TaxId.getTextContent()); |
|
node.name= ScientificName.getTextContent(); |
|
|
|
Element LineageEx=XMLUtilities.one(Taxon, "LineageEx"); |
|
List<Element> taxons= XMLUtilities.elements(LineageEx, "Taxon"); |
|
|
|
List<TaxonNode> nodes= new ArrayList<TaxonNode>(taxons.size()); |
|
for(Element e: taxons) |
|
{ |
|
TaxId=XMLUtilities.one(e, "TaxId"); |
|
ScientificName=XMLUtilities.one(e, "ScientificName"); |
|
TaxonNode newnode= new TaxonNode(); |
|
newnode.id= Integer.parseInt(TaxId.getTextContent()); |
|
newnode.name= ScientificName.getTextContent(); |
|
nodes.add(newnode); |
|
} |
|
nodes.add(node); |
|
|
|
for(int i=1;i< nodes.size();i++) |
|
{ |
|
nodes.get(i).parent_id=nodes.get(i-1).id; |
|
if(!this.id2taxon.containsKey(null,nodes.get(i).id)) |
|
{ |
|
this.id2taxon.put(null,nodes.get(i).id,nodes.get(i)); |
|
} |
|
} |
|
} |
|
else |
|
{ |
|
str.insert(0,"\""+C.escape(node.name)+"\"("+node.id+")"+(str.length()==0?"":" > ")); |
|
} |
|
if(node.parent_id>0) |
|
{ |
|
taxopath(node.parent_id,str); |
|
} |
|
return str; |
|
} |
|
|
|
|
|
|
|
private void run(BufferedReader in) throws Exception |
|
{ |
|
SAXParserFactory f= SAXParserFactory.newInstance(); |
|
f.setNamespaceAware(false); |
|
f.setValidating(false); |
|
SAXParser parser=f.newSAXParser(); |
|
Pattern pattern=Pattern.compile("[a-z][a-z_0-9]+(\.[0-9]+)?",Pattern.CASE_INSENSITIVE); |
|
String line; |
|
while((line=in.readLine())!=null) |
|
{ |
|
if(line.startsWith("#")) continue; |
|
line=line.trim(); |
|
if(line.isEmpty()) continue; |
|
if(!pattern.matcher(line).matches()) |
|
{ |
|
System.err.println("Invalid acn "+line+" does not match "+pattern.pattern()); |
|
continue; |
|
} |
|
String api_url="http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id="+ |
|
line+ |
|
"&rettype=fasta&retmode=xml&tool=acn2tax&email=plindenbaum_at_yahoo_fr" |
|
; |
|
LOG.info(api_url); |
|
|
|
URL url=new URL(api_url); |
|
InputStream is=openURL(url); |
|
TinyXmlHandler handler=new TinyXmlHandler(line); |
|
parser.parse(is, handler); |
|
is.close(); |
|
if(handler.error!=null) |
|
{ |
|
System.err.println("#Error: cannot get "+line+" : "+handler.error); |
|
} |
|
else |
|
{ |
|
StringBuilder taxonpath=taxopath(handler.TSeq_taxid,new StringBuilder()); |
|
System.out.println(line+"\t\""+C.escape(handler.TSeq_defline)+"\"\t"+taxonpath); |
|
} |
|
try { Thread.sleep(this.sleep_time);}catch(Exception e2) {} |
|
} |
|
} |
|
|
|
public static void main(String[] args) |
|
{ |
|
AcnToTaxonomy app=null; |
|
try |
|
{ |
|
app=new AcnToTaxonomy(); |
|
LOG.setLevel(Level.OFF); |
|
int optind=0; |
|
while(optind< args.length) |
|
{ |
|
if(args[optind].equals("-h") || |
|
args[optind].equals("-help") || |
|
args[optind].equals("--help")) |
|
{ |
|
System.err.println(Me.FIRST_NAME+" "+Me.LAST_NAME+" "+Me.MAIL); |
|
System.err.println(Compilation.getLabel()); |
|
System.err.println("Options:"); |
|
System.err.println(" -b <dir> base directory for bdb files:"+app.baseDir); |
|
System.err.println(" --log-level <level> one of "+Level.class.getName()); |
|
System.err.println(" -h help; This screen."); |
|
return; |
|
} |
|
else if(args[optind].equals("--log-level")) |
|
{ |
|
LOG.setLevel(Level.parse(args[++optind])); |
|
} |
|
else if(args[optind].equals("-b")) |
|
{ |
|
app.baseDir=new File(args[optind++]); |
|
if(!app.baseDir.exists()) |
|
{ |
|
System.err.println("File does not exist: "+app.baseDir); |
|
return; |
|
} |
|
if(!app.baseDir.isDirectory()) |
|
{ |
|
System.err.println("File is not a directory: "+app.baseDir); |
|
return; |
|
} |
|
break; |
|
} |
|
else if(args[optind].equals("--")) |
|
{ |
|
optind++; |
|
break; |
|
} |
|
else if(args[optind].startsWith("-")) |
|
{ |
|
System.err.println("Unknown option "+args[optind]); |
|
return; |
|
} |
|
else |
|
{ |
|
break; |
|
} |
|
++optind; |
|
} |
|
app.open(); |
|
if(optind==args.length) |
|
{ |
|
app.run(new BufferedReader(new InputStreamReader(System.in))); |
|
} |
|
else |
|
{ |
|
while(optind< args.length) |
|
{ |
|
java.io.BufferedReader r= IOUtils.openReader(args[optind++]); |
|
app.run(r); |
|
r.close(); |
|
} |
|
} |
|
} |
|
catch(Throwable err) |
|
{ |
|
err.printStackTrace(); |
|
} |
|
finally |
|
{ |
|
if(app!=null) app.close(); |
|
} |
|
} |
|
} |
Hmm, I'm not sure I understand what your input is... Could you give an example?
Can you also show an example of the tabular output from BLAST? In any case, it is better to use the XML output, as it is more stable over time. Also, are you doing this in any particular programming language or tool?