24 September 2010

Connecting to a MongoDB database from R using the C API for MongoDB

Today, Neil posted an article titled" Connecting to a MongoDB database from R using Java". In the current post, I'll show how to use the C API for MongoDB to fetch some MongoDB data from R. The code will be somehow similar to my previous post "A stateful C function for R: parsing Fasta sequences".

OK, First, let's add a few values in mongo:

for(i=1;i> 20;++i)
db.dbsnps.save({_id:"rs"+i,name:"rs"+i});


The C code contains 3 functions.

The first function mongoRconnect connects to the MongoDB server and put the pointer into a R variable.
SEXP mongoRconnect()
{
mongo_connection* conn; /* ptr */
mongo_connection_options opts[1];
mongo_conn_return status;

conn=(mongo_connection*)malloc(sizeof(mongo_connection));
strcpy( opts->host , "127.0.0.1" );
opts->port = 27017;
status = mongo_connect( conn, opts );

return R_MakeExternalPtr(conn, R_NilValue, R_NilValue);
}
The second method mongoRdiconnect closes the connection:
SEXP mongoRdiconnect(SEXP r_handle)
{
mongo_connection* conn;
conn = (mongo_connection*)R_ExternalPtrAddr(r_handle);;
if(conn==NULL) MONGO_ERROR("conn==NULL");
mongo_destroy( conn );
free(conn);
R_ClearExternalPtr(r_handle);
return ScalarInteger(0);
}
The last method mongoRquery scans the database test.dbsnps and inserts the name of the snps into an R array:
SEXP mongoRquery(SEXP r_handle)
{
SEXP values=NULL;
mongo_cursor *cursor;
bson empty[1];
bson_empty( empty );
int i;
mongo_connection* conn = R_ExternalPtrAddr(r_handle);
SEXP* array=NULL;
int array_size=0;
//the R value contains two objects


if(conn==NULL) MONGO_ERROR("handle==NULL");
cursor = mongo_find( conn,
"test.dbsnps",/* ns */
empty,/* fields */
empty,/* return */
0,/* return */
0,/* skip */
0 /* options */
);

while( mongo_cursor_next( cursor ) )
{
bson_iterator it[1];
if ( bson_find( it, &(cursor->current), "name" ))
{
array=(SEXP*)realloc(array,(array_size+1)*sizeof(SEXP));
if(array==NULL) error("out of memory");
array[array_size]=mkChar( bson_iterator_string( it ));
array_size++;
}
}
mongo_cursor_destroy( cursor );
PROTECT(values = allocVector(STRSXP, array_size));
for(i=0;i< array_size;++i)
{
SET_STRING_ELT(values, i, array[i]);
}
free(array);
UNPROTECT(1);
return values;
}
This C code is then be called from R:
mongo <- mongo.open()
mongo.snps(mongo)
mongo.close(mongo)

Result:
[1] "rs1" "rs2" "rs3" "rs4" "rs5" "rs6" "rs7" "rs8" "rs9" "rs10"
[11] "rs11" "rs12" "rs13" "rs14" "rs15" "rs16" "rs17" "rs18" "rs19"



Source code


Makefile

R_HOME=R-2.11.0
MONGO_HOME=mongo-c-driver
run:
gcc -fPIC -I -g -c -Wall -DMONGO_HAVE_STDINT -I ${R_HOME}/include -I ${MONGO_HOME}/src mongoR.c ${MONGO_HOME}/src/*.c
gcc -shared -Wl,-soname,rmongo.so.1 -o librmongo.so *.o
${R_HOME}/bin/R --no-save < mongo.R
clean:
rm *.o


mongoR.c

(again, I'm not sure about those PROTECT/UNPROTECT ...)
#include <ctype.h>
#include <errno.h>
#include <R.h>
#include <Rinternals.h>
#include <bson.h>
#include <mongo.h>

#define MONGO_ERROR(a) { error(a); fputs(a,stdout);exit(EXIT_FAILURE);}

/**
* connect to MONGO
*/
SEXP mongoRconnect()
{
mongo_connection* conn; /* ptr */
mongo_connection_options opts[1];
mongo_conn_return status;

conn=(mongo_connection*)malloc(sizeof(mongo_connection));
if(conn==NULL)
{
MONGO_ERROR("out of memory");
}

strcpy( opts->host , "127.0.0.1" );
opts->port = 27017;

status = mongo_connect( conn, opts );
if(status!= mongo_conn_success)
{
MONGO_ERROR("connection failed");
}

/** the handle is bound a R variable */
return R_MakeExternalPtr(conn, R_NilValue, R_NilValue);
}

/**
* close the mongo connection
*/
SEXP mongoRdiconnect(SEXP r_handle)
{
mongo_connection* conn;
conn = (mongo_connection*)R_ExternalPtrAddr(r_handle);;
if(conn==NULL) MONGO_ERROR("conn==NULL");
mongo_destroy( conn );
free(conn);
R_ClearExternalPtr(r_handle);
return ScalarInteger(0);
}

/**
* get all SNPS
*/
SEXP mongoRquery(SEXP r_handle)
{
SEXP values=NULL;
mongo_cursor *cursor;
bson empty[1];
bson_empty( empty );
int i;
mongo_connection* conn = R_ExternalPtrAddr(r_handle);
SEXP* array=NULL;
int array_size=0;
//the R value contains two objects


if(conn==NULL) MONGO_ERROR("handle==NULL");
cursor = mongo_find( conn,
"test.dbsnps",/* ns */
empty,/* fields */
empty,/* return */
0,/* return */
0,/* skip */
0 /* options */
);

while( mongo_cursor_next( cursor ) )
{
bson_iterator it[1];
if ( bson_find( it, &(cursor->current), "name" ))
{
array=(SEXP*)realloc(array,(array_size+1)*sizeof(SEXP));
if(array==NULL) error("out of memory");
array[array_size]=mkChar( bson_iterator_string( it ));
array_size++;
}
}
mongo_cursor_destroy( cursor );
PROTECT(values = allocVector(STRSXP, array_size));
for(i=0;i< array_size;++i)
{
SET_STRING_ELT(values, i, array[i]);
}
free(array);
UNPROTECT(1);
return values;
}


mongo.R

dyn.load(paste("librmongo", .Platform$dynlib.ext, sep=""))

mongo.open <- function()
{
.Call("mongoRconnect")
}

mongo.close <- function(handler)
{
.Call("mongoRdiconnect",handler)
}

mongo.snps <- function(handler)
{
.Call("mongoRquery",handler)
}

mongo <- mongo.open()
mongo.snps(mongo)
mongo.close(mongo)


That's it,
Pierre

22 September 2010

A Simple tool to get the sex ratio in pubmed.

Just for fun, I wrote a simple java tool to get the sex ratio of the authors in Pubmed. This program fetches a list of names/genders I found in the following perl module: http://cpansearch.perl.org/src/EDALY/Text-GenderFromName-0.33/GenderFromName.pm. The source code is available at

.

(In the following examples, the many names that couldn't be associated to a gender were ignored).

Bioinformatics


Here is the result for "Bioinformatics[journal]"
Women: 3178 (19%) Men: 13149 (80%)
Bioinformatics[Journal]


The 'Lancet' in 2009

Women: 579 (30%) Men: 1331 (69%)
Lancet[Journal] 2009[Date]


Nature in 2009

Women: 1616 (30%) Men: 3768 (69%)
Nature[Journal] 2009[Date]


Nursing in 2009

Women: 29 (70%) Men: 12 (29%)
Nursing[Journal] 2009[Date]



Articles about Charles Darwin

Women: 25 (17%) Men: 118 (82%)
"Darwin C"[PS]



etc... etc..

Source code

/**
* Author:
* Pierre Lindenbaum PhD
* plindenbaum@yahoo.fr
* Source of data:
* http://cpansearch.perl.org/src/EDALY/Text-GenderFromName-0.33/GenderFromName.pm
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLEncoder;
import java.text.Collator;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.stream.events.XMLEvent;

/**
* PubmedGender
*/
public class PubmedGender
{
private Map<String,Float> males=null;
private Map<String,Float> females=null;
private int limit=1000;
private String query="";
private int canvasSize=200;
private boolean ignoreUndefined=false;
private PubmedGender()
{
Collator collator= Collator.getInstance(Locale.US);
collator.setStrength(Collator.PRIMARY);
this.males=new TreeMap<String, Float>(collator);
this.females=new TreeMap<String, Float>(collator);
}

private void loadNames()
throws IOException
{
BufferedReader in=new BufferedReader(new InputStreamReader(new URL("http://cpansearch.perl.org/src/EDALY/Text-GenderFromName-0.33/GenderFromName.pm").openStream()));
String line;
Map<String,Float> map=null;
int posAssign=-1;
while((line=in.readLine())!=null)
{
if(line.startsWith("$Males = {"))
{
map=this.males;
}
else if(line.startsWith("$Females = {"))
{
map=this.females;
}
else if(line.contains("}"))
{
map=null;
}
else if(map!=null && ((posAssign=line.indexOf("=>"))!=-1))
{
String name=line.substring(0,posAssign).replaceAll("'","").toLowerCase().trim();
Float freq=Float.parseFloat(line.substring(posAssign+2).replaceAll("[',]","").toLowerCase().trim());
map.put(name, freq);
}
else
{
map=null;
}
}
in.close();
}
private XMLEventReader newReader(URL url) throws IOException,XMLStreamException
{
XMLInputFactory f= XMLInputFactory.newInstance();
f.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
f.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE,Boolean.FALSE);
f.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES,Boolean.TRUE);
f.setProperty(XMLInputFactory.IS_VALIDATING,Boolean.FALSE);
f.setProperty(XMLInputFactory.SUPPORT_DTD,Boolean.FALSE);
XMLEventReader reader=f.createXMLEventReader(url.openStream());
return reader;
}

private void run() throws Exception
{
int countMales=0;
int countFemales=0;
int countUnknown=0;

URL url= new URL(
"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term="+
URLEncoder.encode(this.query, "UTF-8")+
"&retstart=0&retmax="+this.limit+"&usehistory=y&retmode=xml&email=plindenbaum_at_yahoo.fr&tool=gender");

XMLEventReader reader= newReader(url);
XMLEvent evt;
String QueryKey=null;
String WebEnv=null;
int countId=0;
while(!(evt=reader.nextEvent()).isEndDocument())
{
if(!evt.isStartElement()) continue;
String tag= evt.asStartElement().getName().getLocalPart();
if(tag.equals("QueryKey"))
{
QueryKey= reader.getElementText().trim();
}
else if(tag.equals("WebEnv"))
{
WebEnv= reader.getElementText().trim();
}
else if(tag.equals("Id"))
{
++countId;
}
}
reader.close();

if(countId!=0)
{
url= new URL("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv="+
URLEncoder.encode(WebEnv,"UTF-8")+
"&query_key="+URLEncoder.encode(QueryKey,"UTF-8")+
"&retmode=xml&retmax="+this.limit+"&email=plindenbaum_at_yahoo.fr&tool=mail");

reader= newReader(url);


while(reader.hasNext())
{
evt=reader.nextEvent();
if(!evt.isStartElement()) continue;
if(!evt.asStartElement().getName().getLocalPart().equals("Author")) continue;
String firstName=null;
String initials=null;

while(reader.hasNext())
{
evt=reader.nextEvent();
if(evt.isStartElement())
{
String localName=evt.asStartElement().getName().getLocalPart();
if(localName.equals("ForeName") || localName.equals("FirstName"))
{
firstName=reader.getElementText().toLowerCase();
}
else if(localName.equals("Initials"))
{
initials=reader.getElementText().toLowerCase();
}
}
else if(evt.isEndElement())
{
if(evt.asEndElement().getName().getLocalPart().equals("Author")) break;
}
}
if( firstName==null ) continue;
if( firstName.length()==1 ||
firstName.equals(initials)) continue;

String tokens[]=firstName.split("[ ]+");
firstName="";
for(String s:tokens)
{
if(s.length()> firstName.length())
{
firstName=s;
}
}


if( firstName.length()==1 ||
firstName.equals(initials)) continue;

Float male= this.males.get(firstName);
Float female= this.females.get(firstName);

if(male==null && female==null)
{
//System.err.println("Undefined "+firstName+" / "+lastName);
countUnknown++;
}
else if(male!=null && female==null)
{
countMales++;
}
else if(male==null && female!=null)
{
countFemales++;
}
else if(male < female)
{
countFemales++;
}
else if(female < male)
{
countMales++;
}
else
{
//System.err.println("Undefined "+firstName+" / "+lastName);
countUnknown++;
}
}
reader.close();
}
if(ignoreUndefined) countUnknown=0;

float total= countMales+countFemales+countUnknown;

double radMale=(countMales/total)*Math.PI*2.0;
double radFemale=(countFemales/total)*Math.PI*2.0;
int radius= (canvasSize-2)/2;
String id= "ctx"+System.currentTimeMillis()+""+(int)(Math.random()*1000);
XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
XMLStreamWriter w= xmlfactory.createXMLStreamWriter(System.out,"UTF-8");
w.writeStartElement("html");
w.writeStartElement("body");
w.writeStartElement("div");
w.writeAttribute("style","margin:10px;padding:10px;text-align:center;");
w.writeStartElement("div");
w.writeEmptyElement("canvas");
w.writeAttribute("width", String.valueOf(canvasSize+1));
w.writeAttribute("height", String.valueOf(canvasSize+1));
w.writeAttribute("id", id);
w.writeStartElement("script");
w.writeCharacters(
"function paint"+id+"(){var canvas=document.getElementById('"+id+"');"+
"if (!canvas.getContext) return;var c=canvas.getContext('2d');"+
"c.fillStyle='white';c.strokeStyle='black';"+
"c.fillRect(0,0,"+canvasSize+","+canvasSize+");"+
"c.fillStyle='gray';c.beginPath();c.arc("+(canvasSize/2)+","+(canvasSize/2)+","+radius+",0,Math.PI*2,true);c.fill();c.stroke();"+
"c.fillStyle='blue';c.beginPath();c.moveTo("+(canvasSize/2)+","+(canvasSize/2)+");c.arc("+(canvasSize/2)+","+(canvasSize/2)+","+radius+",0,"+radMale+",false);c.closePath();c.fill();c.stroke();"+
"c.fillStyle='pink';c.beginPath();c.moveTo("+(canvasSize/2)+","+(canvasSize/2)+");c.arc("+(canvasSize/2)+","+(canvasSize/2)+","+radius+","+radMale+","+(radMale+radFemale)+",false);c.closePath();c.fill();c.stroke();}"+
"window.addEventListener('load',function(){ paint"+id+"(); },true);"
);
w.writeEndElement();
w.writeEndElement();

w.writeStartElement("span");
w.writeAttribute("style","color:pink;");
w.writeCharacters("Women: "+countFemales+" ("+(int)((countFemales/total)*100.0)+"%)");
w.writeEndElement();
w.writeCharacters(" ");
w.writeStartElement("span");
w.writeAttribute("style","color:blue;");
w.writeCharacters("Men: "+countMales+" ("+(int)((countMales/total)*100.0)+"%)");
w.writeEndElement();
w.writeCharacters(" ");

if(!this.ignoreUndefined)
{
w.writeStartElement("span");
w.writeAttribute("style","color:gray;");
w.writeCharacters("Undefined : "+countUnknown+" ("+(int)((countUnknown/total)*100.0)+"%)");
w.writeEndElement();
}
w.writeEmptyElement("br");

w.writeStartElement("a");
w.writeAttribute("target","_blank");
w.writeAttribute("href","http://www.ncbi.nlm.nih.gov/sites/entrez?db=pubmed&amp;cmd=search&amp;term="+URLEncoder.encode(this.query,"UTF-8"));
w.writeCharacters(this.query);
w.writeEndElement();


w.writeEndElement();
w.writeEndElement();
w.writeEndElement();
w.flush();
w.close();
}

public static void main(String[] args)
{
try
{
PubmedGender app=new PubmedGender();

int optind=0;
while(optind< args.length)
{
if(args[optind].equals("-h") ||
args[optind].equals("-help") ||
args[optind].equals("--help"))
{
System.err.println("Options:");
System.err.println(" -h help; This screen.");
System.err.println(" -w <int> canvas size default:"+app.canvasSize);
System.err.println(" -L <int> limit number default:"+app.limit);
System.err.println(" -i ignore undefined default:"+app.ignoreUndefined);
System.err.println(" query terms...");
return;
}
else if(args[optind].equals("-L"))
{
app.limit=Integer.parseInt(args[++optind]);
}
else if(args[optind].equals("-w"))
{
app.canvasSize=Integer.parseInt(args[++optind]);
}
else if(args[optind].equals("-i"))
{
app.ignoreUndefined=true;
}
else if(args[optind].equals("--"))
{
optind++;
break;
}
else if(args[optind].startsWith("-"))
{
System.err.println("Unknown option "+args[optind]);
return;
}
else
{
break;
}
++optind;
}
if(optind==args.length)
{
System.err.println("Query missing");
return;
}
app.query="";
while(optind< args.length)
{
if(!app.query.isEmpty()) app.query+=" ";
app.query+=args[optind++];
}
app.query=app.query.trim();
if(app.query.trim().isEmpty())
{
System.err.println("Query is empty");
return;
}
app.loadNames();

app.run();

}
catch (Exception e)
{
e.printStackTrace();
}
}
}


That's it

Pierre

21 September 2010

Trees in Mongodb, my notebook with Gene Ontology

In the current post I've loaded the Gene Ontology into MongoDB and played with the tree structure of the database:

Loading GeneOntology into MongoDB

First, download GO as RDF at http://archive.geneontology.org/latest-termdb/go_daily-termdb.rdf-xml.gz and transform it with my XSLT stylesheet go2mongo.xsl (available here):
<?xml version='1.0' encoding="UTF-8" ?>
<xsl:stylesheet
xmlns:xsl='http://www.w3.org/1999/XSL/Transform'
xmlns:go="http://www.geneontology.org/dtds/go.dtd#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
version='1.0'
>
<xsl:output method="text"/>

<xsl:param name="colName">go</xsl:param>

<xsl:template match="/">
<xsl:apply-templates/>
</xsl:template>

<xsl:template match="go:go">
<xsl:apply-templates select="rdf:RDF"/>
</xsl:template>

<xsl:template match="rdf:RDF">

db.<xsl:value-of select="$colName"/>.drop();

<xsl:apply-templates select="go:term"/>


</xsl:template>

<xsl:template match="go:term">
<xsl:text>term={_id:</xsl:text><xsl:apply-templates select="go:accession" mode="text"/>
<xsl:text>,name:</xsl:text><xsl:apply-templates select="go:name" mode="text"/>
<xsl:if test="go:synonym">
<xsl:text>,synonyms:[</xsl:text>
<xsl:for-each select="go:synonym">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="." mode="text"/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:if test="go:definition">
<xsl:text>,definition:</xsl:text>
<xsl:apply-templates select="go:definition" mode="text"/>
</xsl:if>

<xsl:if test="go:comment">
<xsl:text>,comments:[</xsl:text>
<xsl:for-each select="go:comment">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="." mode="text"/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:if test="go:part_of">
<xsl:text>,part_of:[</xsl:text>
<xsl:for-each select="go:part_of">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="@rdf:resource"/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:if test="go:is_a">
<xsl:text>,is_a:[</xsl:text>
<xsl:for-each select="go:is_a">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="@rdf:resource"/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:if test="go:negatively_regulates">
<xsl:text>,negatively_regulates:[</xsl:text>
<xsl:for-each select="go:negatively_regulates">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="@rdf:resource"/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:if test="go:positively_regulates">
<xsl:text>,positively_regulates:[</xsl:text>
<xsl:for-each select="go:positively_regulates">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="@rdf:resource"/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:if test="go:regulates">
<xsl:text>,regulates:[</xsl:text>
<xsl:for-each select="go:regulates">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="@rdf:resource"/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:if test="go:dbxref">
<xsl:text>,dbxrefs:[</xsl:text>
<xsl:for-each select="go:dbxref">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="."/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:if test="go:association">
<xsl:text>,associations:[</xsl:text>
<xsl:for-each select="go:association">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:text>{evidences:[</xsl:text>
<xsl:for-each select="go:evidence">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="." mode="text"/>
</xsl:for-each>
<xsl:text>],gene_product:{name:</xsl:text>
<xsl:apply-templates select="go:gene_product/go:name" mode="text"/>
<xsl:text>,dbxref:</xsl:text>
<xsl:apply-templates select="go:gene_product/go:dbxref" />
<xsl:text>}}</xsl:text>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:if test="go:is_obsolete">
<xsl:text>,is_obsolete:[</xsl:text>
<xsl:for-each select="go:is_obsolete">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="@rdf:resource"/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>
<xsl:text>};
db.</xsl:text>
<xsl:value-of select="$colName"/>
<xsl:text>.save(term);
</xsl:text>
</xsl:template>

<xsl:template match="go:dbxref">
<xsl:text>{database_symbol:</xsl:text>
<xsl:apply-templates select="go:database_symbol" mode="text"/>
<xsl:text>,reference:</xsl:text>
<xsl:apply-templates select="go:reference" mode="text"/>
<xsl:text>}</xsl:text>
</xsl:template>

<xsl:template match="*" mode="text">
<xsl:text>&quot;</xsl:text>
<xsl:call-template name="escape">
<xsl:with-param name="s" select="."/>
</xsl:call-template>
<xsl:text>&quot;</xsl:text>
</xsl:template>

<xsl:template match="@rdf:resource">
<xsl:text>{&apos;$ref&apos;:&apos;</xsl:text>
<xsl:value-of select="$colName"/>
<xsl:text>&apos;,&apos;$id&apos;:&apos;</xsl:text>
<xsl:value-of select="substring-after(.,'#')"/>
<xsl:text>&apos;}</xsl:text>
</xsl:template>


<xsl:template name="escape">
<xsl:param name="s"/>
<xsl:choose>
<xsl:when test="contains($s,'&quot;')">
<xsl:value-of select="substring-before($s,'&quot;')"/>
<xsl:text>\&quot;</xsl:text>
<xsl:call-template name="escape">
<xsl:with-param name="s" select="substring-after($s,'&quot;')"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$s"/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>



</xsl:stylesheet>


unzip and transform 'go_daily-termdb.rdf-xml' with the stylesheet to generate the javascript:
xsltproc go2mongo.xsl go_daily-termdb.rdf-xml > input.js
The file input.js looks like this:
term={
_id:"GO:0000001",
name:"mitochondrion inheritance",
synonyms:["mitochondrial inheritance"],
definition:"The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton.",
is_a:[
{'$ref':'go','$id':'GO:0048308'},
{'$ref':'go','$id':'GO:0048311'}
]
};
db.go.save(term);
term={_id:"GO:0000002",name:"mitochondrial genome maintenance",definition:"The maintenance of the structure and integrity of the mitochondrial genome; includes replication and segregation of the mitochondrial chromosome.",is_a:[{'$ref':'go','$id':'GO:0007005'}],dbxrefs:[{database_symbol:"InterPro",reference:"IPR009446"},{database_symbol:"Pfam",reference:"PF06420"}]};
db.go.save(term);
term={_id:"GO:0000003",name:"reproduction",synonyms:["GO:0019952","GO:0050876","reproductive physiological process"],definition:"The production by an organism of new individuals that contain some portion of their genetic material inherited from that organism.",is_a:[{'$ref':'go','$id':'GO:0008150'}],dbxrefs:[{database_symbol:"Wikipedia",reference:"Reproduction"}]};
db.go.save(term);
term={_id:"GO:0000005",name:"ribosomal chaperone activity",definition:"OBSOLETE. Assists in the correct assembly of ribosomes or ribosomal subunits in vivo, but is not a component of the assembled ribosome when performing its normal biological function.",comments:["This term was made obsolete because it refers to a class of gene products and a biological process rather than a molecular function."],is_a:[{'$ref':'go','$id':'obsolete_molecular_function'}]};
db.go.save(term);
term={_id:"GO:0042254",name:"ribosome biogenesis",synonyms:["GO:0007046","ribosomal chaperone activity","ribosome biogenesis and assembly"],definition:"The process of the formation of the constituents of the ribosome subunits, their assembly, and their transport to the sites of protein synthesis.",is_a:[{'$ref':'go','$id':'GO:0022613'}],dbxrefs:[{database_symbol:"InterPro",reference:"IPR001790"},{database_symbol:"InterPro",reference:"IPR004037"},{database_symbol:"InterPro",reference:"IPR007023"},{database_symbol:"InterPro",reference:"IPR012948"},{database_symbol:"SP_KW",reference:"KW-0690"},{database_symbol:"HAMAP",reference:"MF_00554"},{database_symbol:"HAMAP",reference:"MF_00699"},{database_symbol:"HAMAP",reference:"MF_00803"},{database_symbol:"HAMAP",reference:"MF_01852"},{database_symbol:"Pfam",reference:"PF00466"},{database_symbol:"Pfam",reference:"PF04939"},{database_symbol:"Pfam",reference:"PF08142"},{database_symbol:"PROSITE",reference:"PS01082"},{database_symbol:"Wikipedia",reference:"Ribosome_biogenesis"},{database_symbol:"SMART",reference:"SM00785"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR00436"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR01575"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR02729"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03594"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03596"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03597"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03598"}]};
db.go.save(term);
term={_id:"GO:0044183",name:"protein binding involved in protein folding",synonyms:["chaperone activity"],definition:"Interacting selectively and non-covalently with any protein or protein complex (a complex of two or more proteins that may include other nonprotein molecules) that contributes to the process of protein folding.",is_a:[{'$ref':'go','$id':'GO:0005515'}]};
db.go.save(term);
term={_id:"GO:0051082",name:"unfolded protein binding",synonyms:["binding unfolded ER proteins","chaperone activity","fimbrium-specific chaperone activity","glycoprotein-specific chaperone activity","histone-specific chaperone activity","ribosomal chaperone activity","tubulin-specific chaperone activity"],definition:"Interacting selectively and non-covalently with an unfolded protein.",is_a:[{'$ref':'go','$id':'GO:0005515'}],dbxrefs:[{database_symbol:"InterPro",reference:"IPR000397"},{database_symbol:"InterPro",reference:"IPR001305"},{database_symbol:"InterPro",reference:"IPR001404"},{database_symbol:"InterPro",reference:"IPR002194"},{database_symbol:"InterPro",reference:"IPR002777"},{database_symbol:"InterPro",reference:"IPR002939"},{database_symbol:"InterPro",reference:"IPR003095"},{database_symbol:"InterPro",reference:"IPR003708"},{database_symbol:"InterPro",reference:"IPR004127"},{database_symbol:"InterPro",reference:"IPR004226"},{database_symbol:"InterPro",reference:"IPR004487"},{database_symbol:"InterPro",reference:"IPR004961"},{database_symbol:"InterPro",reference:"IPR008971"},{database_symbol:"InterPro",reference:"IPR009033"},{database_symbol:"InterPro",reference:"IPR009169"},{database_symbol:"InterPro",reference:"IPR010236"},{database_symbol:"InterPro",reference:"IPR011599"},{database_symbol:"InterPro",reference:"IPR012713"},{database_symbol:"InterPro",reference:"IPR012714"},{database_symbol:"InterPro",reference:"IPR012715"},{database_symbol:"InterPro",reference:"IPR012716"},{database_symbol:"InterPro",reference:"IPR012717"},{database_symbol:"InterPro",reference:"IPR012718"},{database_symbol:"InterPro",reference:"IPR012719"},{database_symbol:"InterPro",reference:"IPR012720"},{database_symbol:"InterPro",reference:"IPR012721"},{database_symbol:"InterPro",reference:"IPR012722"},{database_symbol:"InterPro",reference:"IPR012724"},{database_symbol:"InterPro",reference:"IPR012725"},{database_symbol:"InterPro",reference:"IPR016153"},{database_symbol:"InterPro",reference:"IPR016154"},{database_symbol:"InterPro",reference:"IPR019805"},{database_symbol:"HAMAP",reference:"MF_00117"},{database_symbol:"PROSITE",reference:"MF_00117"},{database_symbol:"HAMAP",reference:"MF_00175"},{database_symbol:"PROSITE",reference:"MF_00175"},{database_symbol:"HAMAP",reference:"MF_00307"},{database_symbol:"PROSITE",reference:"MF_00307"},{database_symbol:"HAMAP",reference:"MF_00308"},{database_symbol:"PROSITE",reference:"MF_00308"},{database_symbol:"PROSITE",reference:"MF_00332"},{database_symbol:"HAMAP",reference:"MF_00505"},{database_symbol:"PROSITE",reference:"MF_00505"},{database_symbol:"HAMAP",reference:"MF_00600"},{database_symbol:"PROSITE",reference:"MF_00679"},{database_symbol:"HAMAP",reference:"MF_00790"},{database_symbol:"PROSITE",reference:"MF_00821"},{database_symbol:"HAMAP",reference:"MF_00822"},{database_symbol:"HAMAP",reference:"MF_01046"},{database_symbol:"HAMAP",reference:"MF_01152"},{database_symbol:"PROSITE",reference:"MF_01152"},{database_symbol:"HAMAP",reference:"MF_01183"},{database_symbol:"ProDom",reference:"PD010430"},{database_symbol:"Pfam",reference:"PF00684"},{database_symbol:"Pfam",reference:"PF01430"},{database_symbol:"Pfam",reference:"PF01556"},{database_symbol:"Pfam",reference:"PF01920"},{database_symbol:"Pfam",reference:"PF02556"},{database_symbol:"Pfam",reference:"PF02970"},{database_symbol:"Pfam",reference:"PF02996"},{database_symbol:"Pfam",reference:"PF03280"},{database_symbol:"PIRSF",reference:"PIRSF002356"},{database_symbol:"PIRSF",reference:"PIRSF002583"},{database_symbol:"PIRSF",reference:"PIRSF005261"},{database_symbol:"PRINTS",reference:"PR00625"},{database_symbol:"PRINTS",reference:"PR01594"},{database_symbol:"PROSITE",reference:"PS00298"},{database_symbol:"PROSITE",reference:"PS00750"},{database_symbol:"PROSITE",reference:"PS00751"},{database_symbol:"PROSITE",reference:"PS00995"},{database_symbol:"PROSITE",reference:"PS51188"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR00074"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR00115"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR00382"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR00809"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR02350"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03142"}]};
db.go.save(term);
term={_id:"GO:0000006",name:"high affinity zinc uptake transmembrane transporter activity",definition:"Catalysis of the transfer of a solute or solutes from one side of a membrane to the other according to the reaction: Zn2+(out) = Zn2+(in), probably powered by proton motive force. In high affinity transport the transporter is able to bind the solute even if it is only present at very low concentrations.",is_a:[{'$ref':'go','$id':'GO:0005385'}]};
db.go.save(term);
(...)
Here the notation {'$ref':'go','$id':'GO:0048308'} is a special object interpreted by mongo as a "Database Reference", a kind of forein-key/link/pointer to another document with a special method named 'fetch' retrieving the linked document.

Load 'input.js' into mongo
mongo mygodatabase input.js

Playing with the GeneOntology Tree

I'm going to look if a go-term is a descendant of one another. First, let's define two useful javascript recursive functions looking for the parent(s) of a given node threw the property is_a.
var goNodeIsA= function (childNode, parentId) {
if (childNode == null) {
return false;
}
if (childNode._id == parentId) {
return true;
}
if (!childNode.is_a) {
return false;
}
for (var i = 0; i < childNode.is_a.length; ++i) {
if (goNodeIsA(childNode.is_a[i].fetch(), parentId))
{
return true;
}
}
return false;
}

var goIsA=function (childId, parentId)
{
return goNodeIsA(db.go.findOne({_id:childId}), parentId);
}

Now, let's find if GO:0003723 (RNA binding) is a descendant of GO:0005488 (binding) ?
> goIsA("GO:0003723","GO:0005488");
true

And is GO:0003723 (RNA binding) is a descendant of GO:0050355 (triphosphatase activity) ?
> goIsA("GO:0003723","GO:0050355");
false

Loop over all the GO terms and find the descendants of GO:0050355 (triphosphatase activity):
> db.go.find({},{name:1,is_a:1}).forEach(function(term) { if(goIsA(term._id,'GO:0005488')) printjson(term); })

(...)
{
"_id" : "GO:0080084",
"name" : "5S rDNA binding",
"is_a" : [
{
"$ref" : "go",
"$id" : "GO:0000182"
}
]
}
{
"_id" : "GO:0080087",
"name" : "callose binding",
"is_a" : [
{
"$ref" : "go",
"$id" : "GO:0030247"
}
]
}
{
"_id" : "GO:0080115",
"name" : "myosin XI tail binding",
"is_a" : [
{
"$ref" : "go",
"$id" : "GO:0032029"
}
]
}
{
"_id" : "GO:0090079",
"name" : "translation regulator activity, nucleic acid binding",
"is_a" : [
{
"$ref" : "go",
"$id" : "GO:0003676"
},
{
"$ref" : "go",
"$id" : "GO:0045182"
}
]
}
(...)




That's it

Pierre

19 September 2010

Indexing some genomic positions with MongoDB: my benchmark

The aim of this post was to find a good way to index some genomic positions with mongodb. (Update: it was tested on my laptop without replication )

My initial dataset is a list of SNPs on the chromosomes chr22 and chrM from the UCSC.

mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 \
-e 'select chrom,chromStart,name from snp130 where chrom in("chr22","chrM")'


From this dataset, a javascript input for mongodb was generated using the following AWK scripts:
awk -f data2mongo.awk dataset.xls > mongo.js
Each script runs a loop searching some SNPs in a random range on the chr22.
mongo bio mongo.js


The winner: Test 1

Indexing both fields:db.things.ensureIndex({chrom:1,position:1}):
BEGIN {
printf("db.snps.drop();\n");
}

{
printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("db.snps.ensureIndex({chrom:1,position:1});\n");
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

result: seconds:0.228

Test 2

Indexing each field : db.things.ensureIndex({chrom:1}); and db.things.ensureIndex({position:1});.
BEGIN {
printf("db.snps.drop();\n");
}

{
printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("db.snps.ensureIndex({chrom:1});\n");
printf("db.snps.ensureIndex({position:1});\n");
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

Result: seconds:0.25



Test 3


No index.


BEGIN {
printf("db.snps.drop();\n");
}

{
printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

Result:seconds:277.751



Test 4

String padding:the chromosome and the position are concatenated in a fixed-length string


BEGIN {
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}

{
gsub(/chr/,"",$1);
printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
Result: seconds:169.028

Test 5

string padding + index


BEGIN {
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}

{
gsub(/chr/,"",$1);
printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("db.snps.ensureIndex({position:1});\n");
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

Result: seconds:0.292



Test 6

_id as a padded string as _id

.
BEGIN {
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}

{
gsub(/chr/,"",$1);
printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({_id:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

result : seconds:1.252



Test 7

Using a padded string for _id and min()/max() for searching
BEGIN {
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}

{
gsub(/chr/,"",$1);
printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find().min({_id:pad(\"22\",pos1)}).max({_id:pad(\"22\",pos2)});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

result Seconds:3.189



Test 8

I also used a composite _id: db.snps.save({_id:{chrom:"chr22",position:14430966},name:"rs2844899"}); but i was not able to query this table with $gt/$lt:


> db.snps.find({_id:{chrom:"chr22",position:{$gt:14430966}}}).count()
0
> db.snps.find({_id:{chrom:"chr22",position:14430966}}).count()
1


Any other idea for indexing those data ? Feel free to leave a message here or on biostar or stackoverflow.


That's it

Pierre

16 September 2010

Using MongoDB with Apache Tomcat, searching for Pubmed articles

The current post continues the previous one titled "
MongoDB and NCBI pubmed: Inserting, searching and updating. My notebook.
".

Here I've used the Java API for MongoDB/BSON in Apache Tomcat, the java servlet container to search and display some pubmed papers stored in mongo via a web interface. At the end the result looks like this:


The servlet

The servlet uses the JAVA API for mongo.

  • An instance of Mongo is created and it connects to the mongodb server
  • We obtain an object DB from the server for the database 'pubmed'
  • we get an object DBCollection from the database
  • If the user provided a valid pmid, the new query is created:
    BasicDBObject query=new BasicDBObject("_id",Long.parseLong(pmid))
    and it is used to search the collection
    BObject article=collection.findOne(query);
  • If the article was found, it was injected in the request and forwarded to the JSP named 'article.jsp'

File: src/WEB-INF/src/fr/inserm/umr915/biomongo/BioMongoServlet.java
package fr.inserm.umr915.biomongo;

import java.io.IOException;
import javax.servlet.ServletConfig;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.ServerAddress;

@SuppressWarnings("serial")
public class BioMongoServlet extends HttpServlet
{
/** mongodb host */
private String mongoHost=ServerAddress.defaultHost();
/** mongodb port */
private int mongoPort=ServerAddress.defaultPort();

@Override
public void init(ServletConfig config) throws ServletException {
super.init(config);
String param=config.getInitParameter("mongo.port");
if(param!=null) this.mongoPort=Integer.parseInt(param);
param=config.getInitParameter("mongo.host");
if(param!=null) this.mongoHost=param;
}

@Override
protected void service(HttpServletRequest req, HttpServletResponse res)
throws ServletException, IOException
{
Mongo mongo=null;
try {
//connect to mongo
mongo = new Mongo(this.mongoHost,this.mongoPort);
//get the database pubmed
DB database=mongo.getDB("pubmed");
if(database==null) throw new ServletException("canot get database");
//get the collection 'articles'
DBCollection col = database.getCollection("articles");
if(col==null) throw new ServletException("canot get collection");
//get the query parameter 'pmid'
String pmid=req.getParameter("pmid");
String jspPage="/WEB-INF/jsp/index.jsp";
//if pmid exist and looks like a number
if(pmid!=null && pmid.matches("[0-9]+"))
{
//find
DBObject article=col.findOne(new BasicDBObject("_id",Long.parseLong(pmid)));
if(article==null)
{
req.setAttribute("message", "Cannot find pmid:"+pmid);
}
else
{
jspPage="/WEB-INF/jsp/article.jsp";
req.setAttribute("article",article);
}
}
req.getRequestDispatcher(jspPage).forward(req, res);
}
catch(ServletException e)
{
throw e;
}
catch(IOException e)
{
throw e;
}
catch(Exception e)
{
throw new ServletException(e);
}
finally
{
//cleanup mongo
if(mongo!=null)
{
mongo.close();
}
mongo=null;
}
}
}

article.jsp

article.jsp displays the article. As this object is an instance of BasicDBObject (that is to say a Map and/or a List) it was very easy to use it with the JSP technology and the Java Standard Tag Library (JSTL).

File: ./src/WEB-INF/jsp/article.jsp
<%@page contentType="text/html" %>
<%@ taglib prefix="c" uri="http://java.sun.com/jstl/core" %>
<%@ taglib prefix="mongo" tagdir="/WEB-INF/tags" %>
<% out.clearBuffer(); %><html><head>
<title>PMID:<c:out value="${param['pmid']}" escapeXml="true"/></title>
</head>
<body>
<h1>PMID:<c:out value="${param['pmid']}" escapeXml="true"/></h1>
<h2><c:out value="${article.title}" escapeXml="true"/></h2>
<dl>
<dt>Date</dt><dd><mongo:dbobject object="${article.created}"/></dd>
<dt>Authors</dt><dd><ul><c:forEach var="author" items="${article.authors}">
<li>
<mongo:dbobject object="${author}"/>
</li>
</c:forEach></ul></dd>
<dt>Journal</dt><dd><mongo:dbobject object="${article.journal}"/></dd>
<dt>Mesh</dt><dd><ul><c:forEach var="term" items="${article.mesh}">
<li>
<c:out value="${term}" escapeXml="true"/>
</li>
</c:forEach></ul></dd>
</dl>


</body>
</html>

dbobject.tag

The previous JSP page calls a custom tag file <mongo:dbobject> which make a 'pretty display' of a BSON object by serializing it with com.mongodb.util.JSON.serialize(Object o)

File: ./src/WEB-INF/tags/dbobject.tag
<%@ tag language="java" pageEncoding="UTF-8"%>
<%@ taglib prefix="c" uri="http://java.sun.com/jstl/core" %>
<%@ taglib prefix="fn" uri="http://java.sun.com/jsp/jstl/functions" %>
<%@tag import="com.mongodb.util.JSON"%>
<%@tag import="com.mongodb.DBObject"%>
<%@attribute name="object" required="true" rtexprvalue="true" type="java.lang.Object"%>
<c:set var="bson"><%= JSON.serialize(this.object) %></c:set>

<div style="background-color:lightgray;"><c:out escapeXml="true" value="${bson}"/></div>


Other Files


index.jsp

a simple form for searching an article.

File: ./src/WEB-INF/jsp/index.jsp
<%@page contentType="text/html" %>
<%@ taglib prefix="c" uri="http://java.sun.com/jstl/core" %>
<% out.clearBuffer(); %><html><head>
<title>Search PMID</title>
</head>
<body>
<form method="GET" action="${pageContext.request.contextPath}/biomongo" >
<div style=" position:absolute;width:400px;height:100px;left:50%;top:50%;margin-left:-100px;margin-top:-50px;">
<c:if test="${not empty message}">
<div style="text-align:center; color:red;"><c:out value="${message}" escapeXml="true"/></div>
</c:if>
<input name="pmid"/><input type="submit"/>
</div>
</form></body>
</html>

web.xml

the deployement descriptor.

File: ./src/WEB-INF/web.xml
<?xml version="1.0" encoding="UTF-8"?>
<web-app xmlns="http://java.sun.com/xml/ns/javaee"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"
version="2.5">

<description>Biomongo Servlet</description>
<display-name>Biomongo</display-name>


<servlet>
<display-name>Biomongo Servlet</display-name>
<servlet-name>biomongo</servlet-name>
<servlet-class>fr.inserm.umr915.biomongo.BioMongoServlet</servlet-class>

<load-on-startup>1</load-on-startup>
</servlet>

<servlet-mapping>
<servlet-name>biomongo</servlet-name>
<url-pattern>/biomongo</url-pattern>
</servlet-mapping>


<jsp-config>
<taglib>
<taglib-uri>http://java.sun.com/jstl/fmt</taglib-uri>
<taglib-location>/WEB-INF/tld/fmt.tld</taglib-location>
</taglib>


<taglib>
<taglib-uri>http://java.sun.com/jstl/core</taglib-uri>
<taglib-location>/WEB-INF/tld/c.tld</taglib-location>
</taglib>



<taglib>
<taglib-uri>http://java.sun.com/jstl/sql</taglib-uri>
<taglib-location>/WEB-INF/tld/sql.tld</taglib-location>
</taglib>



<taglib>
<taglib-uri>http://java.sun.com/jstl/x</taglib-uri>
<taglib-location>/WEB-INF/tld/x.tld</taglib-location>
</taglib>

<taglib>
<taglib-uri>http://java.sun.com/jstl/functions</taglib-uri>
<taglib-location>/WEB-INF/tld/fn.tld</taglib-location>
</taglib>


</jsp-config>

</web-app>

An ANT file for building the project


File: build.xml
<?xml version="1.0" encoding="UTF-8"?>
<project default="biomongo">
<property file="build.properties"/>
<property name="root.dir" value="."/>

<property file="${rootdir}/build.properties"/>


<path id="mongo.path">
<pathelement location="${mongodb.lib}"/>
</path>

<path id="servlet.path">
<pathelement location="${tomcat.servlet.api}"/>
<pathelement location="${tomcat.jsp.api}"/>
</path>


<target name="biomongo">
<property name="base.dir" value="${root.dir}/biomongo"/>
<mkdir dir="${base.dir}/src/WEB-INF/lib"/>
<mkdir dir="${base.dir}/src/WEB-INF/classes"/>

<copy todir="${base.dir}/src/WEB-INF/lib" includeEmptyDirs="false">
<fileset file="${mongodb.lib}"/>
<fileset dir="${taglib.dir}/lib" includes="*.jar"/>
</copy>

<copy todir="${base.dir}/src/WEB-INF/tld" includeEmptyDirs="false">
<fileset dir="${taglib.dir}/tld" includes="*.tld"/>
</copy>


<javac srcdir="${base.dir}/src/WEB-INF/src"
destdir="${base.dir}/src/WEB-INF/classes"
debug="true"
source="1.6"
target="1.6">
<classpath>
<path refid="mongo.path"/>
<path refid="servlet.path"/>
</classpath>
<sourcepath>
<pathelement location="${base.dir}/src/WEB-INF/src"/>
</sourcepath>
<include name="**/BioMongoServlet.java"/>
</javac>

<jar destfile="${tomcat.dir}/webapps/biomongo.war"
basedir="${base.dir}/src">
</jar>

</target>

</project>

Result





That's it.

Pierre

15 September 2010

MongoDB and NCBI pubmed: Inserting, searching and updating. My notebook.

After Neil and Jan, it is now my turn to play with MongoDB.
In this post, I've inserted some NCBI/PUBMED records into Mongo and tested various queries for selecting and updating the entries.

The initial set was a list of ~300 records where Charles Darwin was cited as a "[PersonalNameSubject]": http://www.ncbi.nlm.nih.gov/sites/entrez?db=pubmed&cmd=search&term=%22Darwin+C%22[ps]. This set was saved as XML and transformed to an input for Mongo using the following XSLT stylesheet:(code available here: http://openwetware.org/wiki/Image:Pubmed2mongo.xsl)

<?xml version="1.0" encoding="UTF-8"?>



<xsl:stylesheet xmlns:xsl='http://www.w3.org/1999/XSL/Transform' version="1.0">

<!--

This stylesheet transforms one or more Pubmed
Article in xml format into JSON for mongodb

Author: Pierre Lindenbaum PhD plindenbaum@yahoo.fr

-->

<xsl:output method="text" encoding="UTF-8"/>


<xsl:template match="/">
<xsl:text>
db.articles.drop();
</xsl:text>
<xsl:apply-templates/>

<xsl:text>
db.articles.ensureIndex({pmid:1}, {unique: true});
db.articles.ensureIndex({created:1});
db.articles.ensureIndex({authors:1});
db.articles.ensureIndex({mesh:1});
db.articles.ensureIndex({journal:1});


</xsl:text>

</xsl:template>

<xsl:template match="PubmedArticleSet">
<xsl:apply-templates select="PubmedArticle"/>
</xsl:template>



<xsl:template match="PubmedArticle">
<xsl:text>
article={
</xsl:text>
<xsl:value-of select="concat('_id:',MedlineCitation/PMID,',')"/>
<xsl:value-of select="concat('pmid:',MedlineCitation/PMID)"/>
<xsl:apply-templates select="MedlineCitation/DateCreated"/>
<xsl:apply-templates select="MedlineCitation/Article/ArticleTitle"/>
<xsl:apply-templates select="MedlineCitation/Article/Journal/JournalIssue/Issue"/>
<xsl:apply-templates select="MedlineCitation/Article/Journal/JournalIssue/Volume"/>
<xsl:apply-templates select="MedlineCitation/Article/Pagination/MedlinePgn"/>
<xsl:apply-templates select="MedlineCitation/Article/Journal"/>
<xsl:apply-templates select="PubmedData/ArticleIdList/ArticleId[@IdType='doi']"/>
<xsl:apply-templates select="PubmedData/ArticleIdList/ArticleId[@IdType='pmc']"/>
<xsl:apply-templates select="MedlineCitation/Article/Language"/>
<xsl:apply-templates select="MedlineCitation/Article/AuthorList"/>
<xsl:apply-templates select="MedlineCitation/MeshHeadingList"/>
<xsl:text>};
article=db.articles.save(article);

</xsl:text>

<xsl:text>

</xsl:text>
</xsl:template>




<xsl:template match="ArticleId[@IdType='doi']">
<xsl:text>,doi:</xsl:text>
<xsl:apply-templates select="." mode="text"/>
</xsl:template>

<xsl:template match="ArticleId[@IdType='pmc']">
<xsl:text>,pmc:</xsl:text>
<xsl:apply-templates select="." mode="text"/>
</xsl:template>

<xsl:template match="AuthorList">
<xsl:text>,authors:[</xsl:text>
<xsl:for-each select="Author">
<xsl:if test="position()!=1">,</xsl:if>
<xsl:apply-templates select="."/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:template>




<xsl:template match="MeshHeadingList">
<xsl:text>,mesh:[</xsl:text>
<xsl:for-each select="MeshHeading/DescriptorName">
<xsl:if test="position()!=1">,</xsl:if>
<xsl:apply-templates select="." mode="text"/>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:template>


<xsl:template match="Author">
<xsl:text>{firstName:"</xsl:text>
<xsl:call-template name="escape">
<xsl:with-param name="s" select="ForeName"/>
</xsl:call-template>
<xsl:text>",lastName:"</xsl:text>
<xsl:call-template name="escape">
<xsl:with-param name="s" select="LastName"/>
</xsl:call-template>
<xsl:text>"}</xsl:text>
</xsl:template>




<xsl:template match="DateCreated">
<xsl:text>,created:{</xsl:text>
<xsl:value-of select="concat('year:',number(Year),',')"/>
<xsl:value-of select="concat('month:',number(Month),',')"/>
<xsl:value-of select="concat('day:',number(Day))"/>
<xsl:text>}</xsl:text>
</xsl:template>


<xsl:template match="ArticleTitle">
<xsl:text>,title:</xsl:text>
<xsl:apply-templates select="." mode="text"/>
</xsl:template>


<xsl:template match="Issue">
<xsl:text>,issue:</xsl:text>
<xsl:apply-templates select="." mode="text"/>
</xsl:template>

<xsl:template match="Volume">
<xsl:text>,volume:</xsl:text>
<xsl:apply-templates select="." mode="text"/>
</xsl:template>

<xsl:template match="MedlinePgn">
<xsl:text>,pgn:</xsl:text>
<xsl:apply-templates select="." mode="text"/>
</xsl:template>



<xsl:template match="Journal">
<xsl:text>,journal:{title:</xsl:text>
<xsl:apply-templates select="Title" mode="text"/>
<xsl:apply-templates select="ISOAbbreviation"/>
<xsl:apply-templates select="ISSN[@IssnType='Print']"/>
<xsl:text>}</xsl:text>
</xsl:template>

<xsl:template match="ISOAbbreviation">
<xsl:text>,abbr:</xsl:text>
<xsl:apply-templates select="." mode="text"/>
</xsl:template>

<xsl:template match="ISSN">
<xsl:text>,issn:</xsl:text>
<xsl:apply-templates select="." mode="text"/>
</xsl:template>


<xsl:template match="Language">
<xsl:text>,lang:</xsl:text>
<xsl:apply-templates select="." mode="text"/>
</xsl:template>

<xsl:template match="*" mode="text">
<xsl:text>"</xsl:text>
<xsl:call-template name="escape">
<xsl:with-param name="s" select="."/>
</xsl:call-template>
<xsl:text>"</xsl:text>
</xsl:template>

<xsl:template name="escape">
<xsl:param name="s"/>
<xsl:choose>
<xsl:when test="contains($s,'&quot;')">
<xsl:value-of select="substring-before($s,'&quot;')"/>
<xsl:text>\&quot;</xsl:text>
<xsl:call-template name="escape">
<xsl:with-param name="s" select="substring-after($s,'&quot;')"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$s"/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>




</xsl:stylesheet>


Transform pubmed/xml to mongo:
xsltproc pubmed2mongo.xsl ~/pubmed_result.txt > input.js

> more input.js

db.articles.drop();

article={_id:20665232,pmid:20665232,created:{year:2010,month:8,day:10},title:"Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth amp; death of species\".",issue:"2",volume:"43",pgn:"363-99",journal:{title:"Journal of the history of biology",abbr:"J Hist Biol",issn:"0022-5010"},doi:"10.1007/s10739-009-9189-9",lang:"eng",authors:[{firstName:"Paul D",lastName:"Brinkman"}],mesh:["Animals","Fossils","History, 19th Century","Natural History","Phylogeny","Vertebrates"]};
article=db.articles.save(article);




article={_id:20626121,pmid:20626121,created:{year:2010,month:7,day:14},title:"[The biomedical legacy of Charles Darwin]",issue:"2",volume:"146",pgn:"87-9",journal:{title:"Gaceta médica de México",abbr:"Gac Med Mex",issn:"0016-3813"},lang:"spa",authors:[{firstName:"Emilio",lastName:"García-Procel"}],mesh:["Biology","Evolution","History, 19th Century","History, 20th Century","Medicine"]};
article=db.articles.save(article);




article={_id:20503821,pmid:20503821,created:{year:2010,month:5,day:27},title:"Darwin and the popularization of evolution.",issue:"1",volume:"64",pgn:"5-24",journal:{title:"Notes and records of the Royal Society of London",abbr:"Notes Rec R Soc Lond",issn:"0035-9149"},lang:"eng",authors:[{firstName:"Bernard",lastName:"Lightman"}],mesh:["Biology","Evolution","Genetic Fitness","History, 19th Century","History, 20th Century","Humans","Male","Philosophy","Religion","Science","Selection, Genetic","United States"]};
article=db.articles.save(article);

and load this file into mongo in the database 'pubmed'.
mongo pubmed input.js


and add a few indexes for this database:
>db.articles.ensureIndex({pmid:1}, {unique: true});
db.articles.ensureIndex({created:1});
db.articles.ensureIndex({authors:1});
db.articles.ensureIndex({mesh:1});
db.articles.ensureIndex({journal:1});


print 3 records

> db.articles.find().limit(3).forEach(printjson);
{
"_id" : 20665232,
"pmid" : 20665232,
"created" : {
"year" : 2010,
"month" : 8,
"day" : 10
},
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\".",
"issue" : "2",
"volume" : "43",
"pgn" : "363-99",
"journal" : {
"title" : "Journal of the history of biology",
"abbr" : "J Hist Biol",
"issn" : "0022-5010"
},
"doi" : "10.1007/s10739-009-9189-9",
"lang" : "eng",
"authors" : [
{
"firstName" : "Paul D",
"lastName" : "Brinkman"
}
],
"mesh" : [
"Animals",
"Fossils",
"History, 19th Century",
"Natural History",
"Phylogeny",
"Vertebrates"
]
}
{
"_id" : 20626121,
"pmid" : 20626121,
"created" : {
"year" : 2010,
"month" : 7,
"day" : 14
},
"title" : "[The biomedical legacy of Charles Darwin]",
"issue" : "2",
"volume" : "146",
"pgn" : "87-9",
"journal" : {
"title" : "Gaceta médica de México",
"abbr" : "Gac Med Mex",
"issn" : "0016-3813"
},
"lang" : "spa",
"authors" : [
{
"firstName" : "Emilio",
"lastName" : "García-Procel"
}
],
"mesh" : [
"Biology",
"Evolution",
"History, 19th Century",
"History, 20th Century",
"Medicine"
]
}
{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}

skip 2 and print 1 record

> db.articles.find().skip(2).limit(1).forEach(printjson);
{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}

How many records in the database, regardless of the limit ?

> db.articles.find().limit(20).count();
327

How many records will be printed ?

> db.articles.find().limit(20).size();
20

Find one record with PMID=-1

> db.articles.findOne({pmid:-1});
null

Find PMID 20180452

> db.articles.findOne({pmid:20180452});
{
"_id" : 20180452,
"pmid" : 20180452,
"created" : {
"year" : 2010,
"month" : 2,
"day" : 25
},
"title" : "[Darwin's hidden feeling for emotions of the species]",
"issue" : "50-51",
"volume" : "106",
"pgn" : "3443-6",
"journal" : {
"title" : "Läkartidningen",
"abbr" : "Lakartidningen",
"issn" : "0023-7205"
},
"lang" : "swe",
"authors" : [
{
"firstName" : "Gösta",
"lastName" : "Alfvén"
}
],
"mesh" : [
"Animals",
"Emotions",
"England",
"Evolution",
"Famous Persons",
"History, 19th Century",
"Humans",
"Species Specificity"
]
}

Searching in Arrays: find records having a mesh equals to 'Lactose Intolerance'

> db.articles.find({mesh:'Lactose Intolerance'}).forEach(printjson);
{
"_id" : 17575947,
"pmid" : 17575947,
"created" : {
"year" : 2007,
"month" : 6,
"day" : 19
},
"title" : "Darwin's illness: a final diagnosis.",
"issue" : "1",
"volume" : "61",
"pgn" : "23-9",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Fernando",
"lastName" : "Orrego"
},
{
"firstName" : "Carlos",
"lastName" : "Quintana"
}
],
"mesh" : [
"Biology",
"Crohn Disease",
"England",
"History, 19th Century",
"Humans",
"Lactose Intolerance",
"Male"
]
}
{
"_id" : 15811889,
"pmid" : 15811889,
"created" : {
"year" : 2005,
"month" : 4,
"day" : 6
},
"title" : "Darwin's illness revealed.",
"issue" : "954",
"volume" : "81",
"pgn" : "248-51",
"journal" : {
"title" : "Postgraduate medical journal",
"abbr" : "Postgrad Med J",
"issn" : "0032-5473"
},
"doi" : "10.1136/pgmj.2004.025569",
"pmc" : "PMC1743237",
"lang" : "eng",
"authors" : [
{
"firstName" : "Anthony K",
"lastName" : "Campbell"
},
{
"firstName" : "Stephanie B",
"lastName" : "Matthews"
}
],
"mesh" : [
"Famous Persons",
"Great Britain",
"History, 19th Century",
"Lactose Intolerance"
]
}

Explain, find with an index (PMID)

> db.articles.find({pmid:20180452}).explain();
{
"cursor" : "BtreeCursor pmid_1",
"nscanned" : 1,
"nscannedObjects" : 1,
"n" : 1,
"millis" : 0,
"indexBounds" : {
"pmid" : [
[
20180452,
20180452
]
]
}
}

Explain, find without an index (volume)

> db.articles.find({volume:'81'}).explain();
{
"cursor" : "BasicCursor",
"nscanned" : 327,
"nscannedObjects" : 327,
"n" : 2,
"millis" : 0,
"indexBounds" : {

}
}

'AND operator' search for records having mesh and 'Evolution' AND 'Religion' AND 'History, 19th Century'

>db.articles.find({mesh:{$all:["Evolution","History, 19th Century","Religion"]}}).limit(2).forEach(printjson);

{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}
{
"_id" : 20145235,
"pmid" : 20145235,
"created" : {
"year" : 2010,
"month" : 2,
"day" : 10
},
"title" : "Darwin's compassionate view of human nature.",
"issue" : "6",
"volume" : "303",
"pgn" : "557-8",
"journal" : {
"title" : "JAMA : the journal of the American Medical Association",
"abbr" : "JAMA"
},
"doi" : "10.1001/jama.2010.101",
"lang" : "eng",
"authors" : [
{
"firstName" : "Paul",
"lastName" : "Ekman"
}
],
"mesh" : [
"Altruism",
"Animals",
"Behavior, Animal",
"Empathy",
"Evolution",
"Famous Persons",
"History, 19th Century",
"Humans",
"Religion",
"Social Values"
]
}

Search all, only return the name and the pmid, limit 5

> db.articles.find({},{"title":1,"pmid":1}).limit(5).forEach(printjson);
{
"_id" : 20665232,
"pmid" : 20665232,
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\"."
}
{
"_id" : 20626121,
"pmid" : 20626121,
"title" : "[The biomedical legacy of Charles Darwin]"
}
{
"_id" : 20503821,
"pmid" : 20503821,
"title" : "Darwin and the popularization of evolution."
}
{
"_id" : 20481191,
"pmid" : 20481191,
"title" : "Between the Beagle and the barnacle: Darwin's microscopy, 1837-1854."
}
{
"_id" : 20338536,
"pmid" : 20338536,
"title" : "Darwin as a student of behavior."
}

Search all, only returns the name and the date, sort by year, limit 5

> db.articles.find({},{"title":1,"created":1}).limit(5).sort({"created.year":1}).forEach(printjson);

{
"_id" : 20255988,
"created" : {
"year" : 1947,
"month" : 12,
"day" : 1
},
"title" : "Charles Darwin's life at Downe."
}
{
"_id" : 14840951,
"created" : {
"year" : 1951,
"month" : 12,
"day" : 1
},
"title" : "Some letters from Charles Darwin to Jeffries Wyman."
}
{
"_id" : 13110175,
"created" : {
"year" : 1954,
"month" : 12,
"day" : 1
},
"title" : "The life of the shawl."
}
{
"_id" : 13457284,
"created" : {
"year" : 1957,
"month" : 12,
"day" : 1
},
"title" : "[Charles Darwin; 1809-1882.]"
}
{
"_id" : 13377935,
"created" : {
"year" : 1957,
"month" : 12,
"day" : 1
},
"title" : "[Darwin and Freud; on Sigmund Freud's centenary.]"
}

Search all, omit some fields, limit 2

> db.articles.find({},{title:0,pmid:0,authors:0,created:0,mesh:0,journal:0}).limit(2).forEach(printjson);
{
"_id" : 20665232,
"issue" : "2",
"volume" : "43",
"pgn" : "363-99",
"doi" : "10.1007/s10739-009-9189-9",
"lang" : "eng"
}
{
"_id" : 20626121,
"issue" : "2",
"volume" : "146",
"pgn" : "87-9",
"lang" : "spa"
}

Sub documents: Search articles published in the 'Lancet',limit 2

> db.articles.find({"journal.title":"Lancet"}).limit(2).forEach(printjson);
{
"_id" : 19205083,
"pmid" : 19205083,
"created" : {
"year" : 2009,
"month" : 2,
"day" : 9
},
"title" : "What Darwin learned in medical school.",
"issue" : "9662",
"volume" : "373",
"pgn" : "454-5",
"journal" : {
"title" : "Lancet",
"abbr" : "Lancet"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Niles",
"lastName" : "Eldredge"
}
],
"mesh" : [
"Education, Medical",
"Educational Status",
"Evolution",
"History, 19th Century",
"Natural History",
"Schools, Medical",
"Scotland"
]
}
{
"_id" : 11597663,
"pmid" : 11597663,
"created" : {
"year" : 2001,
"month" : 10,
"day" : 12
},
"title" : "Darwin the philosopher?",
"issue" : "9288",
"volume" : "358",
"pgn" : "1118",
"journal" : {
"title" : "Lancet",
"abbr" : "Lancet",
"issn" : "0140-6736"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "J",
"lastName" : "Radcliffe Richards"
}
],
"mesh" : [
"History, 19th Century",
"Philosophy"
]
}

Only print the first author of each article

> db.articles.find({},{"authors":{$slice:1}}).limit(3).forEach(printjson);
{
"_id" : 20665232,
"pmid" : 20665232,
"created" : {
"year" : 2010,
"month" : 8,
"day" : 10
},
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\".",
"issue" : "2",
"volume" : "43",
"pgn" : "363-99",
"journal" : {
"title" : "Journal of the history of biology",
"abbr" : "J Hist Biol",
"issn" : "0022-5010"
},
"doi" : "10.1007/s10739-009-9189-9",
"lang" : "eng",
"authors" : [
{
"firstName" : "Paul D",
"lastName" : "Brinkman"
}
],
"mesh" : [
"Animals",
"Fossils",
"History, 19th Century",
"Natural History",
"Phylogeny",
"Vertebrates"
]
}
{
"_id" : 20626121,
"pmid" : 20626121,
"created" : {
"year" : 2010,
"month" : 7,
"day" : 14
},
"title" : "[The biomedical legacy of Charles Darwin]",
"issue" : "2",
"volume" : "146",
"pgn" : "87-9",
"journal" : {
"title" : "Gaceta médica de México",
"abbr" : "Gac Med Mex",
"issn" : "0016-3813"
},
"lang" : "spa",
"authors" : [
{
"firstName" : "Emilio",
"lastName" : "García-Procel"
}
],
"mesh" : [
"Biology",
"Evolution",
"History, 19th Century",
"History, 20th Century",
"Medicine"
]
}
{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}

Only print the last author of each article

> db.articles.find({},{"authors":{$slice:-1}}).limit(3).forEach(printjson);
{
"_id" : 20665232,
"pmid" : 20665232,
"created" : {
"year" : 2010,
"month" : 8,
"day" : 10
},
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\".",
"issue" : "2",
"volume" : "43",
"pgn" : "363-99",
"journal" : {
"title" : "Journal of the history of biology",
"abbr" : "J Hist Biol",
"issn" : "0022-5010"
},
"doi" : "10.1007/s10739-009-9189-9",
"lang" : "eng",
"authors" : [
{
"firstName" : "Paul D",
"lastName" : "Brinkman"
}
],
"mesh" : [
"Animals",
"Fossils",
"History, 19th Century",
"Natural History",
"Phylogeny",
"Vertebrates"
]
}
{
"_id" : 20626121,
"pmid" : 20626121,
"created" : {
"year" : 2010,
"month" : 7,
"day" : 14
},
"title" : "[The biomedical legacy of Charles Darwin]",
"issue" : "2",
"volume" : "146",
"pgn" : "87-9",
"journal" : {
"title" : "Gaceta médica de México",
"abbr" : "Gac Med Mex",
"issn" : "0016-3813"
},
"lang" : "spa",
"authors" : [
{
"firstName" : "Emilio",
"lastName" : "García-Procel"
}
],
"mesh" : [
"Biology",
"Evolution",
"History, 19th Century",
"History, 20th Century",
"Medicine"
]
}
{
"_id" : 20503821,
"pmid" : 20503821,
"created" : {
"year" : 2010,
"month" : 5,
"day" : 27
},
"title" : "Darwin and the popularization of evolution.",
"issue" : "1",
"volume" : "64",
"pgn" : "5-24",
"journal" : {
"title" : "Notes and records of the Royal Society of London",
"abbr" : "Notes Rec R Soc Lond",
"issn" : "0035-9149"
},
"lang" : "eng",
"authors" : [
{
"firstName" : "Bernard",
"lastName" : "Lightman"
}
],
"mesh" : [
"Biology",
"Evolution",
"Genetic Fitness",
"History, 19th Century",
"History, 20th Century",
"Humans",
"Male",
"Philosophy",
"Religion",
"Science",
"Selection, Genetic",
"United States"
]
}

comparators: Print the title and the date for the articles published between February and March 2009.

> db.articles.find({"created.year":2009,"created.month":{$lt:4,$gt:1} },{title:1,created:1}).limit(2).forEach(printjson);

{
"_id" : 19283711,
"created" : {
"year" : 2009,
"month" : 3,
"day" : 17
},
"title" : "The day of immunology 2009."
}
{
"_id" : 19258529,
"created" : {
"year" : 2009,
"month" : 3,
"day" : 4
},
"title" : "MMBR to highlight microbial evolution, diversity, and ecology in 2009."
}

Print the title and the meshs for the articles NOT having ('Evolution' and 'Animals') in the mesh list.

> db.articles.find({mesh:{$nin:["Evolution","Animals"]} },{title:1,mesh:1}).limit(2).forEach(printjson);

{
"_id" : 20338529,
"title" : "The Darwin of pangenesis.",
"mesh" : [
"Cell Biology",
"History, 19th Century",
"Natural History",
"Physiology",
"Reproduction",
"Selection, Genetic"
]
}
{
"_id" : 20338527,
"title" : "Cross- and self-fertilization of plants.",
"mesh" : [
"Bibliography as Topic",
"Botany",
"Correspondence as Topic",
"Crosses, Genetic",
"Fertilization",
"Flowers",
"History, 19th Century",
"Orchidaceae",
"Plant Physiological Phenomena",
"Pollination"
]
}

Print the title and the authors for the articles having two authors, limit 2.

>db.articles.find({authors:{$size:2}} ,{title:1,authors:1}).limit(2).forEach(printjson);

{
"_id" : 20338526,
"title" : "Minute observations and theoretical framework of Darwin's studies on climbing plants.",
"authors" : [
{
"firstName" : "Jean-Marc",
"lastName" : "Drouin"
},
{
"firstName" : "Thierry",
"lastName" : "Deroin"
}
]
}
{
"_id" : 20338522,
"title" : "A non-Darwinian Darwin: An introduction.",
"authors" : [
{
"firstName" : "Jean",
"lastName" : "Gayon"
},
{
"firstName" : "Michel",
"lastName" : "Veuille"
}
]
}

Search for articles having a pmc-id.

> db.articles.find({pmc:{$exists:true}} ,{title:1,pmc:1}).limit(2).forEach(printjson);
{
"_id" : 19884139,
"title" : "Darwin's contributions to our understanding of emotional expressions.",
"pmc" : "PMC2781895"
}
{
"_id" : 19258529,
"title" : "MMBR to highlight microbial evolution, diversity, and ecology in 2009.",
"pmc" : "PMC2650884"
}

Search for articles lacking a pmc-id.

>db.articles.find({pmc:{$exists:false}} ,{title:1,pmc:1}).limit(2).forEach(printjson);
{
"_id" : 20665232,
"title" : "Charles Darwin's beagle voyage, fossil vertebrate succession, and \"the gradual birth & death of species\"."
}
{ "_id" : 20626121, "title" : "[The biomedical legacy of Charles Darwin]" }

Number of article where the title is not a string


> db.articles.find({title:{$not:{$type:2}}}).count();

0

OR operator:Articles published in 'Nature' or 'Lancet',limit 3

> db.articles.find({$or:[{"journal.title":"Lancet"},{"journal.title":"Nature"}]},{title:1,"journal.title":1}).limit(3).forEach(printjson);

{
"_id" : 19242459,
"title" : "Q&A: Getting under Darwin's skin. Interview by Adam Rutherford.",
"journal" : {
"title" : "Nature"
}
}
{
"_id" : 19205083,
"title" : "What Darwin learned in medical school.",
"journal" : {
"title" : "Lancet"
}
}
{
"_id" : 19020602,
"title" : "Birthdays to remember.",
"journal" : {
"title" : "Nature"
}
}

AND operator:Number of articles published in 'Nature' AND 'Lancet',limit 2

> db.articles.find({$and:[{"journal.title":"Lancet"},{"journal.title":"Nature"}]}).count();
0

REGEX operator:Search articles having a word starting with 'darwin'

> db.articles.find({title:/darwin[a-z]+/i },{title:1}).limit(5).forEach(printjson);

{
"_id" : 20338530,
"title" : "Sexual selection: Another Darwinian process."
}
{ "_id" : 20338522, "title" : "A non-Darwinian Darwin: An introduction." }
{
"_id" : 19784612,
"title" : "The predictability of evolution: glimpses into a post-Darwinian world."
}
{
"_id" : 19213802,
"title" : "Darwinian evolution in the light of genomics."
}
{
"_id" : 19203139,
"title" : "Darwinian theory, functionalism, and the first American psychological revolution."
}

Using javascript $where: articles starting with 'DARWIN'

> db.articles.find({$where:"this.title.substr(0,6)==\"DARWIN\""},{title:1}).limit(5).forEach(printjson);

{
"_id" : 14341734,
"title" : "DARWIN AS THE SOURCE OF FREUD'S NEO-LAMARCKIANISM."
}
{ "_id" : 14275525, "title" : "DARWIN'S ILLNESS." }
{
"_id" : 14248443,
"title" : "DARWIN'S HEALTH IN RELATION TO HIS VOYAGE TO SOUTH AMERICA."
}
{ "_id" : 14217140, "title" : "DARWIN'S ILLNESS." }

Array element by position: First author is Darwin

> db.articles.find({"authors.0.lastName":"Darwin"},{authors:1,title:1}).forEach(printjson);

{
"_id" : 11640659,
"title" : "[Not Available]",
"authors" : [
{
"firstName" : "C",
"lastName" : "Darwin"
}
]
}

Min/Max operator with indexed properties. Print PMID between [797714,382974[


>db.articles.find({},{pmid:1}).max({pmid:797714}).min({pmid:382974}).forEach(printjson);


{ "_id" : 382974, "pmid" : 382974 }
{ "_id" : 385078, "pmid" : 385078 }
{ "_id" : 387661, "pmid" : 387661 }
{ "_id" : 392346, "pmid" : 392346 }
{ "_id" : 395189, "pmid" : 395189 }
{ "_id" : 797709, "pmid" : 797709 }
{ "_id" : 797713, "pmid" : 797713 }

Map reduce: build a database containing all the distinct authors' names

> db.eval( function()
{
db.articles.find().forEach(function(o)
{
for(i in o.authors)
{
var s= o.authors[i].lastName;
db.names.save({name:s, _id:s});
}
}
);});
> db.names.find().limit(10).forEach(printjson);



{ "_id" : "Hodge", "name" : "Hodge" }
{ "_id" : "Deutsch", "name" : "Deutsch" }
{ "_id" : "Bellini", "name" : "Bellini" }
{ "_id" : "Torgerson", "name" : "Torgerson" }
{ "_id" : "Brinkman", "name" : "Brinkman" }
{ "_id" : "Alfvén", "name" : "Alfvén" }
{ "_id" : "Derry", "name" : "Derry" }
{ "_id" : "Fara", "name" : "Fara" }
{ "_id" : "Eldredge", "name" : "Eldredge" }
{ "_id" : "Buss", "name" : "Buss" }

Distinct operator: get all the distinct publication year

> db.articles.distinct("created.year")

[
1947,
1951,
1954,
1957,
1959,
1960,
1963,
1965,
1967,
1969,
1970,
1971,
1972,
1973,
1974,
1975,
1977,
1978,
1979,
1980,
1981,
1982,
1983,
1984,
1985,
1986,
1987,
1989,
1990,
1991,
1992,
1993,
1994,
1995,
1996,
1997,
1998,
1999,
2000,
2001,
2002,
2003,
2004,
2005,
2006,
2007,
2008,
2009,
2010
]

Distinct operator: get all the distinct publication years for the Journal 'Nature'

> db.articles.distinct("created.year",{"journal.title":"Nature"})

[ 1969, 1982, 1983, 2001, 2004, 2005, 2007, 2008, 2009 ]

GROUP operator: the number of articles per journal having mesh='Evolution' and having a number of articles greater than 2

> db.articles.group(
{
key:{},
cond:{mesh:"Evolution"},
initial:{journal:{},total:0},
reduce: function(object, aggregate)
{
var count=aggregate.journal[object.journal.title];
if(!count)
{
count=0;
}
count++;
aggregate.journal[object.journal.title]=count;
aggregate.total++;
},
finalize:function(aggregate)
{
for(j in aggregate.journal)
{
if( aggregate.journal[j]<3)
{
delete aggregate.journal[j];
}
}
}
})



[
{
"journal" : {
"Comptes rendus biologies" : 5,
"Läkartidningen" : 6,
"Isis; an international review devoted to the history of science and its cultural influences" : 6,
"Tidsskrift for den Norske lægeforening : tidsskrift for praktisk medicin, ny række" : 3,
"Endeavour" : 3,
"Journal of the history of biology" : 9,
"Die Naturwissenschaften" : 4,
"The American psychologist" : 7,
"Scientific American" : 3,
"Current biology : CB" : 5,
"Science (New York, N.Y.)" : 3,
"History and philosophy of the life sciences" : 8,
"Nature" : 10,
"Studies in history and philosophy of biological and biomedical sciences" : 5,
"Journal of the history of the behavioral sciences" : 5,
"Singapore medical journal" : 4,
"Rivista di biologia" : 4,
"Annals of science" : 3,
"Journal of the history of medicine and allied sciences" : 4
},
"total" : 207
}
]

Remove some fields for the journal published in Nature

> db.articles.update({"journal.title":"Nature"},{$unset:{title:1,authors:1,created:1,mesh:1}},false,true)
> db.articles.find({"journal.title":"Nature"}).limit(2).forEach(printjson);



{
"_id" : 19242459,
"doi" : "10.1038/4571087b",
"issue" : "7233",
"journal" : {
"title" : "Nature",
"abbr" : "Nature"
},
"lang" : "eng",
"pgn" : "1087",
"pmid" : 19242459,
"volume" : "457"
}
{
"_id" : 19020602,
"doi" : "10.1038/456324a",
"issue" : "7220",
"journal" : {
"title" : "Nature",
"abbr" : "Nature"
},
"lang" : "eng",
"pgn" : "324-5",
"pmid" : 19020602,
"volume" : "456"
}

Add another author for PMID:382974

> db.articles.update({"pmid":382974},{$push:{authors:{firstName:"John",lastName:"Doe"}}},false,true)
> db.articles.find({"pmid":382974},{authors:1}).forEach(printjson)


{
"_id" : 382974,
"authors" : [
{
"firstName" : "N",
"lastName" : "Freire-Maia"
},
{
"firstName" : "John",
"lastName" : "Doe"
}
]
}

Remove first author for PMID:382974

> db.articles.update({pmid:382974},{$pop:{authors:-1}},false,true)
> db.articles.find({pmid:382974},{authors:1}).forEach(printjson);


{
"_id" : 382974,
"authors" : [
{
"firstName" : "John",
"lastName" : "Doe"
}
]
}

Insert comment for PMID:382974

> db.articles.update({pmid:382974},{$set:{comment:"this is my comment"}})
> db.articles.find({pmid:382974},{comment:1}).forEach(printjson);


{ "_id" : 382974, "comment" : "this is my comment" }

add 2000000 to the year of publication for PMID:382974

> db.articles.update({pmid:382974},{$inc:{"created.year":2000000}},false)
> db.articles.find({pmid:382974},{created:1}).forEach(printjson);


{
"_id" : 382974,
"created" : {
"day" : 26,
"month" : 10,
"year" : 2001979
}
}


That's it.

Pierre