/**
 * Author: Pierre Lindenbaum PhD
 * WWW: http://plindenbaum.blogspot.com
 * Motivation:
 * Given a gene, identify the world experts
 * http://biostar.stackexchange.com/questions/4296
 */

import java.net.URLEncoder;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Command-line tool: for every gene name given on the command line, look the
 * gene up in NCBI Entrez Gene, download the PubMed records referenced by the
 * gene entry, group their authors, and write the top-ranked author as XML to
 * standard output.
 *
 * <p>Authors are ranked by the number of distinct articles they appear on,
 * then by a positional {@code factor} (see {@link Author#factor}).
 *
 * <p>The static helpers {@code findByFullName}, {@code findByInitials},
 * {@code cleanMailToken} and {@code collectMails} are package-private so that
 * the matching heuristics can be exercised without network access.
 */
public class BioStar4296
{
    /** Base URL of the NCBI E-utilities; NCBI only serves them over HTTPS. */
    private static final String EUTILS_BASE="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";

    /** Characters used to cut an affiliation string into tokens (space, tab, {@code : < , > ( )}). */
    private static final Pattern ADDRESS_SEPARATORS=Pattern.compile("[ \t:<,>()]");

    private static final Logger LOG=Logger.getLogger(BioStar4296.class.getName());

    /** Orders authors from most to least articles, ties broken by the larger {@code factor}. */
    private static final Comparator<Author> BEST_FIRST=new Comparator<Author>()
    {
        @Override
        public int compare(Author o1, Author o2)
        {
            // set sizes are non-negative, so this subtraction cannot overflow
            int i=o2.pmids.size()-o1.pmids.size();
            if(i!=0) return i;
            // later is more interesting ? not sure...
            // explicit comparison: subtracting two arbitrary ints may overflow
            return (o2.factor<o1.factor) ? -1 : ((o2.factor==o1.factor) ? 0 : 1);
        }
    };

    private String organism="Homo Sapiens";
    private final DocumentBuilder docBuilder;
    private final XPath xpath;
    private final Collator collator;

    /**
     * One person found in PubMed author lists, together with everything that
     * was collected about them across articles.
     */
    static class Author
    {
        String suffix="";
        String firstName="";
        String lastName="";
        String initials="";
        /** Lower-cased e-mail addresses attributed to this author. */
        Set<String> mails=new HashSet<String>();
        /** PubMed identifiers of the articles this author appears on. */
        Set<Integer> pmids=new TreeSet<Integer>();
        /**
         * Product of the zero-based positions this author held in the author
         * lists of the processed articles (starts at 1).
         * NOTE(review): a single first-author position (index 0) makes this 0
         * for good; kept as in the original heuristic.
         */
        int factor=1;
        /** Affiliation strings of the articles this author appears on. */
        Set<String> affilitations=new HashSet<String>();

        @Override
        public int hashCode()
        {
            final int prime=31;
            int result=1;
            result=prime*result+((firstName==null) ? 0 : firstName.hashCode());
            result=prime*result+((lastName==null) ? 0 : lastName.hashCode());
            return result;
        }

        /** Two authors are equal when both first name and last name are equal. */
        @Override
        public boolean equals(Object obj)
        {
            if(this==obj) return true;
            if(obj==null || getClass()!=obj.getClass()) return false;
            Author other=(Author)obj;
            return sameText(firstName, other.firstName) && sameText(lastName, other.lastName);
        }

        /** Null-safe string equality. */
        private static boolean sameText(String a, String b)
        {
            return (a==null) ? (b==null) : a.equals(b);
        }

        @Override
        public String toString()
        {
            return firstName+" "+lastName+" lab:"+this.affilitations+" mails:"+this.mails;
        }

        /**
         * Writes this author as a {@code <Person>} element.
         *
         * @param w the XML writer to append to
         * @throws Exception if the writer fails
         */
        void write(XMLStreamWriter w)
            throws Exception
        {
            w.writeStartElement("Person");
            w.writeCharacters("\n");

            writeTextElement(w, "firstName", firstName);
            writeTextElement(w, "lastName", lastName);
            for(Integer id : pmids)
            {
                writeTextElement(w, "pmid", String.valueOf(id));
            }
            for(String mail : mails)
            {
                writeTextElement(w, "mail", mail);
            }
            for(String lab : affilitations)
            {
                // element name spelling is part of the emitted format; left unchanged
                writeTextElement(w, "affilitation", lab);
            }

            w.writeEndElement();
            w.writeCharacters("\n");
        }

        /** Writes {@code <name>text</name>} followed by a newline. */
        private static void writeTextElement(XMLStreamWriter w, String name, String text)
            throws Exception
        {
            w.writeStartElement(name);
            w.writeCharacters(text);
            w.writeEndElement();
            w.writeCharacters("\n");
        }
    }

    /**
     * Prepares the XML parser, the XPath engine and the collator used to
     * compare names, and switches logging off (re-enabled by {@code -v}).
     *
     * @throws Exception if the XML parser cannot be created
     */
    private BioStar4296() throws Exception
    {
        LOG.setLevel(Level.OFF);

        DocumentBuilderFactory f=DocumentBuilderFactory.newInstance();
        f.setNamespaceAware(false);
        f.setCoalescing(true);
        f.setIgnoringComments(true);
        f.setIgnoringElementContentWhitespace(true);
        f.setValidating(false);
        this.docBuilder=f.newDocumentBuilder();

        XPathFactory factory=XPathFactory.newInstance();
        this.xpath=factory.newXPath();

        // PRIMARY strength: differences of case and accents are ignored when comparing names
        this.collator=Collator.getInstance(Locale.FRENCH);
        this.collator.setStrength(Collator.PRIMARY);
    }

    /**
     * Writes one {@code <gene>} element for {@code geneName}: resolves the gene
     * in Entrez Gene, fetches every PubMed record referenced by the gene entry,
     * and writes the top-ranked author.
     *
     * @param w        the XML writer to append to
     * @param geneName the gene symbol to search for
     * @return 0 when an author was written; -1 when the gene was not found or
     *         ambiguous, had no PubMed reference, or yielded no author (an XML
     *         comment explaining why is written instead)
     * @throws Exception on network, parsing or XML writing failure
     */
    private int search(XMLStreamWriter w,String geneName)
        throws Exception
    {
        w.writeCharacters("\n");
        w.writeStartElement("gene");
        w.writeAttribute("name", geneName);

        String url=EUTILS_BASE+"esearch.fcgi?db=gene&term="+
            URLEncoder.encode(geneName+"[PREF] \""+this.organism+"\"[ORGN]", "UTF-8");
        LOG.info(url);
        Document dom=this.docBuilder.parse(url);
        NodeList list=(NodeList)this.xpath.evaluate(
            "/eSearchResult/IdList/Id",
            dom,XPathConstants.NODESET);
        if(list.getLength()==0)
        {
            return closeGeneWithComment(w, "Cannot find any entry for "+geneName);
        }
        if(list.getLength()!=1)
        {
            return closeGeneWithComment(w, "Ambiguous name "+geneName);
        }
        String geneId=list.item(0).getTextContent();
        LOG.info("GeneId:"+geneId);
        w.writeAttribute("geneId", geneId);

        url=EUTILS_BASE+"efetch.fcgi?db=gene&id="+geneId+"&rettype=text&retmode=xml";
        LOG.info(url);
        dom=this.docBuilder.parse(url);
        list=(NodeList)this.xpath.evaluate(
            "//PubMedId",
            dom,XPathConstants.NODESET);
        if(list.getLength()==0)
        {
            return closeGeneWithComment(w, "No pubmed for "+geneName);
        }

        // a TreeSet removes duplicate references and fixes the processing order
        Set<Integer> pmidSet=new TreeSet<Integer>();
        for(int articleIdx=0;articleIdx<list.getLength();++articleIdx)
        {
            String pmid=list.item(articleIdx).getTextContent();
            LOG.info("PMID:"+pmid);
            pmidSet.add(Integer.parseInt(pmid.trim()));
        }
        w.writeAttribute("count-pmids",String.valueOf(pmidSet.size()));
        w.writeCharacters("\n");

        List<Author> authors=new ArrayList<Author>();
        for(Integer pmid : pmidSet)
        {
            collectAuthors(pmid, authors);
        }

        if(authors.isEmpty())
        {
            return closeGeneWithComment(w, "No Author found");
        }

        Collections.sort(authors, BEST_FIRST);
        authors.get(0).write(w);

        w.writeEndElement();
        return 0;
    }

    /** Writes {@code comment}, closes the open {@code <gene>} element and returns -1. */
    private static int closeGeneWithComment(XMLStreamWriter w, String comment)
        throws Exception
    {
        w.writeComment(comment);
        w.writeEndElement();
        return -1;
    }

    /**
     * Downloads one PubMed record and merges its authors into {@code authors}.
     * Records without an {@code Affiliation} element are skipped entirely.
     */
    private void collectAuthors(Integer pmid, List<Author> authors)
        throws Exception
    {
        LOG.info("PMID:"+pmid);
        String url=EUTILS_BASE+"efetch.fcgi?db=pubmed&id="+pmid+"&retmode=xml";
        LOG.info("url:"+url);
        Document dom=this.docBuilder.parse(url);
        Node n=(Node)this.xpath.evaluate("//Affiliation", dom,XPathConstants.NODE);
        if(n==null) return;
        String affiliation=n.getTextContent();
        String[] addressFragments=ADDRESS_SEPARATORS.split(affiliation);
        LOG.info("affiliation:"+affiliation);
        NodeList authorList=(NodeList)this.xpath.evaluate(
            "//AuthorList/Author",
            dom,XPathConstants.NODESET);
        LOG.info("Authors:"+authorList.getLength());

        for(int j=0;j<authorList.getLength();++j)
        {
            Author parsed=readAuthor(authorList.item(j));
            if(parsed==null) continue;
            LOG.info("Make New Author:"+parsed);

            Author author=findByFullName(authors, parsed, this.collator);
            if(author!=null)
            {
                LOG.info("Same: "+author+" "+parsed);
            }
            else
            {
                author=findByInitials(authors, parsed, this.collator);
                if(author!=null)
                {
                    LOG.info("Same: "+author+" "+parsed);
                    // keep the most complete spelling of the first name
                    if(author.firstName.length()<parsed.firstName.length())
                    {
                        author.firstName=parsed.firstName;
                    }
                }
            }
            if(author==null)
            {
                LOG.info("Adding: "+parsed);
                authors.add(parsed);
                author=parsed;
            }

            author.factor=saturatingMultiply(author.factor, j);
            author.affilitations.add(affiliation);
            author.pmids.add(pmid);

            if(affiliation.indexOf('@')!=-1)
            {
                collectMails(author, addressFragments, this.collator);
            }
        }
    }

    /**
     * Builds an {@link Author} from the child elements of an {@code Author} node.
     *
     * @return the author, or {@code null} when the node holds a {@code CollectiveName}
     */
    private static Author readAuthor(Node authorNode)
    {
        Author author=new Author();
        for(Node c1=authorNode.getFirstChild();c1!=null;c1=c1.getNextSibling())
        {
            if(c1.getNodeType()!=Node.ELEMENT_NODE) continue;
            String tag=c1.getNodeName();
            String content=c1.getTextContent();
            if(tag.equals("LastName"))
            {
                author.lastName=content;
            }
            else if(tag.equals("FirstName") || tag.equals("ForeName"))
            {
                author.firstName=content;
            }
            else if(tag.equals("Initials"))
            {
                author.initials=content;
            }
            else if(tag.equals("CollectiveName"))
            {
                return null;
            }
            else if(tag.equals("Suffix"))
            {
                author.suffix=content;
            }
        }
        return author;
    }

    /**
     * Finds the first known author whose non-empty first name and whose last
     * name both compare equal to the candidate's under {@code collator}.
     *
     * @return the matching known author, or {@code null}
     */
    static Author findByFullName(List<Author> known, Author candidate, Collator collator)
    {
        for(Author p : known)
        {
            if(!p.firstName.isEmpty() &&
                collator.compare(p.firstName,candidate.firstName)==0 &&
                collator.compare(p.lastName,candidate.lastName)==0)
            {
                return p;
            }
        }
        return null;
    }

    /**
     * Finds the first known author with the same last name whose first name is
     * compatible with the candidate's: one side's first name starts with the
     * other side's initials (case-insensitively), or both have equal initials.
     *
     * @return the matching known author, or {@code null}
     */
    static Author findByInitials(List<Author> known, Author candidate, Collator collator)
    {
        String candidateFirst=candidate.firstName.toLowerCase(Locale.ROOT);
        String candidateInitials=candidate.initials.toLowerCase(Locale.ROOT);
        for(Author p : known)
        {
            boolean compatible=
                (!candidateInitials.isEmpty() &&
                    p.firstName.toLowerCase(Locale.ROOT).startsWith(candidateInitials)) ||
                // both sides lower-cased: the first name already is, so upper-case
                // initials would otherwise never match
                (!p.initials.isEmpty() &&
                    candidateFirst.startsWith(p.initials.toLowerCase(Locale.ROOT))) ||
                collator.compare(p.initials,candidate.initials)==0;
            if(compatible && collator.compare(p.lastName,candidate.lastName)==0)
            {
                return p;
            }
        }
        return null;
    }

    /**
     * Removes any "{}" sequence and one trailing period from an affiliation token.
     */
    static String cleanMailToken(String token)
    {
        // String is immutable: the cleaned value must be kept, not discarded
        String mail=token.replace("{}", "");
        if(mail.endsWith(".")) mail=mail.substring(0,mail.length()-1);
        return mail;
    }

    /**
     * Adds to {@code author.mails} every token that contains '@' and whose part
     * before the '@' contains (or compares equal to) the author's last name, or
     * the author's first name when that is longer than one character.
     */
    static void collectMails(Author author, String[] fragments, Collator collator)
    {
        String lastLower=author.lastName.toLowerCase(Locale.ROOT);
        String firstLower=author.firstName.toLowerCase(Locale.ROOT);
        for(String fragment : fragments)
        {
            String mail=cleanMailToken(fragment);
            int index=mail.indexOf('@');
            if(index==-1) continue;
            String mailPrefix=mail.substring(0,index).toLowerCase(Locale.ROOT);

            boolean byLastName=mailPrefix.contains(lastLower) ||
                collator.compare(mailPrefix, author.lastName)==0;
            boolean byFirstName=author.firstName.length()>1 &&
                (mailPrefix.contains(firstLower) ||
                    collator.compare(mailPrefix, author.firstName)==0);
            if(byLastName || byFirstName)
            {
                LOG.info("Adding: "+mail+" to "+author);
                author.mails.add(mail.toLowerCase(Locale.ROOT));
            }
        }
    }

    /** Multiplies two ints, clamping to the int range instead of wrapping around. */
    private static int saturatingMultiply(int a, int b)
    {
        long product=(long)a*(long)b;
        if(product>Integer.MAX_VALUE) return Integer.MAX_VALUE;
        if(product<Integer.MIN_VALUE) return Integer.MIN_VALUE;
        return (int)product;
    }

    /**
     * Entry point. Options: {@code -h} help, {@code -o <organism>},
     * {@code -v} show logs, {@code --} end of options; remaining arguments are
     * gene names. Writes an {@code <experts>} XML document to standard output.
     */
    public static void main(String[] args)
    {
        try {
            BioStar4296 app=new BioStar4296();
            int optind=0;
            while(optind<args.length)
            {
                String arg=args[optind];
                if(arg.equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum");
                    System.err.println("Options:");
                    System.err.println(" -o <organism> ["+app.organism+"]");
                    System.err.println(" -v show logs");
                    return;
                }
                else if(arg.equals("-o"))
                {
                    // report a missing value instead of failing on args[++optind]
                    if(optind+1>=args.length)
                    {
                        System.err.println("Option -o requires an organism name");
                        return;
                    }
                    app.organism=args[++optind];
                }
                else if(arg.equals("-v"))
                {
                    LOG.setLevel(Level.ALL);
                }
                else if(arg.equals("--"))
                {
                    optind++;
                    break;
                }
                else if(arg.startsWith("-"))
                {
                    System.err.println("Unknown option: "+arg);
                    return;
                }
                else
                {
                    break;
                }
                ++optind;
            }

            if(optind==args.length)
            {
                System.err.println("Gene Name missing");
                return;
            }

            XMLOutputFactory xmlfactory=XMLOutputFactory.newInstance();
            XMLStreamWriter w=xmlfactory.createXMLStreamWriter(System.out,"UTF-8");
            w.writeStartDocument("UTF-8","1.0");
            w.writeCharacters("\n");
            w.writeStartElement("experts");
            w.writeCharacters("\n");
            while(optind<args.length)
            {
                app.search(w,args[optind]);
                optind++;
                w.writeCharacters("\n");
            }
            w.writeEndElement();
            w.writeEndDocument();
            w.flush();
        } catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}
Actually, journal impact factor has nothing to do with the importance of individual articles. Common misconception :-) See http://altmetrics.org/manifesto/.
@Larry, most (all?) genes in GeneWiki are human genes.
A serious issue that emerges is gene synonyms. One may need to consider species as well because the same gene in different organisms will function differently. So, Pierre's step 2 needs some refinement but nonetheless gets my vote.
Pierre -- Science doesn't sleep, so neither should you... ;)
Nice Q/A. You could also take into account the journal impact factor. This way the authors would additionally be ranked by the "quality" of their work. Also, care should be taken that mail recipients do not mark the mail as spam.
As usual, I'm thoroughly impressed. I wonder how difficult it would be to adapt this to Google App Engine so we could call it as a web service. (On my first attempt to run locally, I get an error that is likely due to my complete java ignorance...) Any GAE experts in the audience?
I don't think that Google App Engine would be the best place to run this service: there is a lot of I/O and it could be slow (for example, for PRNP, 429 articles were downloaded).
Bummer, was wondering if that would be an issue (as it is with my pubmed2wordle app). Anyway, thanks!
Amazing answer!!! Refining with Gene synonyms, Journal impact factor, Article views or downloads, Grants obtained by author (if any), works on similar genes, publication of invited reviews....