import java.awt.Dimension;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.swing.JOptionPane;
import javax.swing.JScrollPane;
import javax.swing.JTable;
import javax.swing.ListSelectionModel;
import javax.swing.table.DefaultTableModel;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

/**
 * Command-line helper that maps free-text gene/protein names to the
 * {@code Gene-ref_locus} value of matching human records in the NCBI Gene
 * database.
 *
 * <p>Each command-line argument is searched with NCBI E-utilities
 * ({@code esearch} followed by one {@code efetch} per hit). One
 * tab-separated line per term is written to standard output:
 * <ul>
 *   <li>{@code <locus>\t<term>} when a record was found or chosen,</li>
 *   <li>{@code #NOT-FOUND\t<term>} when nothing matched, the term was blank,
 *       or the selection dialog was dismissed,</li>
 *   <li>{@code #NOT-SELECTED\t<term>} when the dialog was confirmed without
 *       a selected row.</li>
 * </ul>
 * When a term has several hits, a Swing dialog asks the user to pick one.
 */
public class Biostar5460
{
    private static final Logger LOG = Logger.getLogger("Biostar5460");

    /** Base URL of the NCBI E-utilities; requests must go over HTTPS. */
    private static final String EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";

    /**
     * Client identification sent with every request. The E-utilities
     * parameter is named {@code email} (not {@code mail}). The address below
     * is the placeholder from the original code and should be replaced by a
     * real contact address before heavy use.
     */
    private static final String CLIENT_PARAMS = "&tool=biostar5460&email=me_at_nowhere_com";

    /** One Gene record: its id plus the fields shown in the selection dialog. */
    private static class Item
    {
        final String id;
        String protRefDesc = "";
        String entrezgeneSummary = "";
        String locus;

        Item(String id)
        {
            this.id = id;
        }
    }

    private final DocumentBuilder builder;
    private final XPath xpath;

    /**
     * Prepares the XML parser and the XPath evaluator reused for all requests.
     *
     * @throws Exception if the XML parser cannot be configured
     */
    private Biostar5460() throws Exception
    {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setCoalescing(true);
        factory.setNamespaceAware(false);
        factory.setExpandEntityReferences(true);
        factory.setValidating(false);
        factory.setIgnoringComments(true);
        factory.setIgnoringElementContentWhitespace(true);
        this.builder = factory.newDocumentBuilder();
        this.xpath = XPathFactory.newInstance().newXPath();
    }

    /**
     * Searches NCBI Gene for {@code term} restricted to Homo sapiens and
     * prints the result line for that term to standard output.
     *
     * @param term free-text gene or protein name; a null or blank term is
     *             reported as {@code #NOT-FOUND} without contacting NCBI
     * @throws Exception if a request fails or a response cannot be parsed
     */
    private void search(String term) throws Exception
    {
        // A blank term would degrade to a bare organism query matching every human gene.
        if (term == null || term.trim().isEmpty())
        {
            System.out.println("#NOT-FOUND\t" + term);
            return;
        }
        LOG.info(term);

        NodeList idList = findGeneIds(term);
        int total = idList.getLength();
        if (total == 0)
        {
            System.out.println("#NOT-FOUND\t" + term);
            return;
        }

        List<Item> items = new ArrayList<Item>(total);
        for (int i = 0; i < total; ++i)
        {
            LOG.info((i + 1) + "/" + total);
            items.add(fetchItem(idList.item(i).getTextContent()));
        }

        if (items.size() == 1)
        {
            System.out.println(items.get(0).locus + "\t" + term);
        }
        else
        {
            System.out.println(chooseInteractively(items, term));
        }
    }

    /** Runs the esearch query for {@code term} and returns the matching {@code Id} nodes. */
    private NodeList findGeneIds(String term) throws Exception
    {
        String uri = EUTILS_BASE + "esearch.fcgi?db=gene&retmode=xml" + CLIENT_PARAMS
            + "&term=" + URLEncoder.encode(term + " \"Homo sapiens\"[ORGN]", "UTF-8");
        LOG.info(uri);
        Document dom = builder.parse(uri);
        return (NodeList) xpath.evaluate("/eSearchResult/IdList/Id", dom, XPathConstants.NODESET);
    }

    /** Fetches the Gene record {@code id} and extracts locus, protein description and summary. */
    private Item fetchItem(String id) throws Exception
    {
        Item item = new Item(id);
        String uri = EUTILS_BASE + "efetch.fcgi?db=gene&retmode=xml&retmax=100" + CLIENT_PARAMS
            + "&id=" + id;
        LOG.info(uri);
        Document dom = builder.parse(uri);
        // XPathConstants.STRING yields "" (never null) when the path matches nothing.
        item.locus = (String) xpath.evaluate(
            "/Entrezgene-Set/Entrezgene/Entrezgene_gene/Gene-ref/Gene-ref_locus",
            dom, XPathConstants.STRING);
        item.protRefDesc = (String) xpath.evaluate(
            "/Entrezgene-Set/Entrezgene/Entrezgene_prot/Prot-ref/Prot-ref_desc",
            dom, XPathConstants.STRING);
        item.entrezgeneSummary = (String) xpath.evaluate(
            "/Entrezgene-Set/Entrezgene/Entrezgene_summary",
            dom, XPathConstants.STRING);
        return item;
    }

    /**
     * Shows the candidate records in a modal dialog and returns the output
     * line for {@code term} that corresponds to the user's decision.
     */
    private String chooseInteractively(List<Item> items, String term)
    {
        DefaultTableModel model = new DefaultTableModel(
            new String[]{"id", "locus", "desc", "summary"}, items.size());
        for (int row = 0; row < items.size(); ++row)
        {
            Item item = items.get(row);
            model.setValueAt(item.id, row, 0);
            model.setValueAt(item.locus, row, 1);
            model.setValueAt(item.protRefDesc, row, 2);
            model.setValueAt(item.entrezgeneSummary, row, 3);
        }

        JTable table = new JTable(model);
        table.setSelectionMode(ListSelectionModel.SINGLE_SELECTION);
        JScrollPane scroll = new JScrollPane(table);
        scroll.setPreferredSize(new Dimension(800, 500));

        int answer = JOptionPane.showConfirmDialog(null, scroll, "Select",
            JOptionPane.OK_CANCEL_OPTION, JOptionPane.QUESTION_MESSAGE, null);
        if (answer != JOptionPane.OK_OPTION)
        {
            return "#NOT-FOUND\t" + term;
        }

        int selected = table.getSelectedRow();
        if (selected == -1)
        {
            return "#NOT-SELECTED\t" + term;
        }
        return items.get(selected).locus + "\t" + term;
    }

    /**
     * Searches every command-line argument in order. The first failure is
     * logged and stops processing of the remaining terms.
     *
     * @param args gene or protein names to look up
     */
    public static void main(String[] args)
    {
        String current = null;
        try
        {
            Biostar5460 app = new Biostar5460();
            for (String term : args)
            {
                current = term;
                app.search(term);
            }
        }
        catch (Exception e)
        {
            LOG.log(Level.SEVERE,
                current == null ? "Initialisation failed" : "Lookup failed for term: " + current,
                e);
        }
    }
}
I think a better title for this question might be "Retrieving official gene/protein symbols from full length gene/protein names automatically"
@Casey: Done :)
Thank you all for your comments and suggestions! Having the latest hot computer --> 2500$; One full run of 454 sequencing --> 6000$; Biostar Forum --> Priceless ;)