// Scraper Description
/*
* NCBI PubMed XML JavaScript scraper
*
* Author: eric miller <em@potlach.org>
*
* TODO:
*
* - model article <-> journal publication relationship
* - extract and associate relevant gene and protein information with article
* - enhance SKOS model of MeSH terms
*/
// Vocabulary namespaces; predicate and class URIs are built by appending a
// local name to one of these.
var RDFNS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    DCNS = "http://purl.org/dc/elements/1.1/",
    DCTNS = "http://purl.org/dc/terms/",
    FOAFNS = "http://xmlns.com/foaf/0.1/",
    SKOSNS = "http://www.w3.org/2004/02/skos/core#",
    PMNS = "http://example.com/vocab#",
    MESHNS = "http://www.nlm.nih.gov/mesh/vocab#",
    GENENS = "http://uniprot.org/gene/vocab#";

// Base URIs under which resource identifiers (MeSH terms, genes, PubMed
// records and journals) are minted.
var meshroot = "http://www.nlm.nih.gov/mesh/term/",
    generoot = "http://purl.org/lsid/gene/",
    pubmedroot = "http://purl.org/lsid/ncbi/",
    journalroot = "http://purl.org/lsid/nlm/journal/";

// Query-string suffix appended to eutils requests to identify this tool.
var pbtool = "&tool=piggybank&email=em@potlach.org";
/**
 * Evaluates `xpath` relative to `contextNode` and returns the first item the
 * result iterator yields (whatever `iterateNext()` returns when there is no
 * match is passed through unchanged).
 *
 * @param document    document whose `evaluate` method runs the expression
 * @param contextNode node the expression is evaluated against
 * @param xpath       XPath expression string
 */
var getNode = function(document, contextNode, xpath) {
    var matches = document.evaluate(xpath, contextNode, null, XPathResult.ANY_TYPE, null);
    return matches.iterateNext();
};
/**
 * Normalises a scraped string by handing it to the host's
 * `utilities.trimString` and returning that result.
 *
 * @param s the raw string
 */
var cleanString = function(s) {
    var trimmed = utilities.trimString(s);
    return trimmed;
};
/**
 * Scrapes a single PubMed record from the XML document fetched from the NCBI
 * eutils efetch service and records it through `data.addStatement`.
 *
 * Statements are emitted for the article title, abstract, affiliation,
 * authors (as foaf:Person resources), MeSH headings (as skos:Concept
 * resources) and the journal. Every section is best-effort: a missing element
 * or a failure in one section is logged and the remaining sections still run.
 *
 * The article URI is derived inside this function (pubmedroot + "pmid/" + id)
 * because no `uri` variable is visible in this scope.
 *
 * @param document the fetched record document; element names are matched in
 *                 lower case, like every other XPath in this scraper.
 */
var scrapeXMLRecord = function (document) {
    log(document);

    // Trimmed innerHTML of the first element matching `xpath`, or null when
    // nothing matches or the element is empty.
    var firstValue = function (xpath) {
        var found = utilities.gatherElementsOnXPath(document, document, xpath);
        if (!found || found.length === 0) {
            return null;
        }
        return cleanString(found[0].innerHTML) || null;
    };

    // Trimmed text of the first child element of `context` that exists among
    // `names` (tried in order), or null when none of them has text.
    var childText = function (context, names) {
        for (var n = 0; n < names.length; n++) {
            var node = getNode(document, context, "./" + names[n] + "[1]/text()[1]");
            if (node && node.nodeValue) {
                var text = cleanString(node.nodeValue);
                if (text) {
                    return text;
                }
            }
        }
        return null;
    };

    // Works out the URI of the article being scraped, or null when no PubMed
    // id can be found.
    var resolveArticleURI = function () {
        var pmid = null;
        // Preferred source: the `id=` parameter of the efetch URL this
        // document was loaded from.
        var href = document.location ? String(document.location.href) : "";
        var fromURL = /[?&]id=(\d+)/.exec(href);
        if (fromURL) {
            pmid = fromURL[1];
        } else {
            // Fallback: assumes the record carries its own id in
            // medlinecitation/pmid, as in the PubMed DTD -- TODO confirm
            // against a live efetch response.
            var fromXML = firstValue("//medlinecitation/pmid");
            if (fromXML && /^\d+$/.test(fromXML)) {
                pmid = fromXML;
            }
        }
        return pmid ? pubmedroot + "pmid/" + pmid : null;
    };

    // Records one <author> element as a foaf:Person linked to the article.
    var addAuthor = function (uri, author) {
        var lastName = childText(author, ["lastname"]);
        if (!lastName) {
            // No personal name on this entry; skip it rather than reuse the
            // previous author's name.
            return;
        }
        // Some records use forename, others firstname.
        var givenName = childText(author, ["forename", "firstname"]);
        // Initials only stand in for the given name in the label and id.
        var labelGiven = givenName || childText(author, ["initials"]);
        var alabel = labelGiven ? labelGiven + " " + lastName : lastName;
        // Same space-to-underscore rule as the MeSH term ids below.
        var creatoruri = pubmedroot + "pubmed/author/" + alabel.replace(/ /g, "_");
        // add person
        data.addStatement(creatoruri, RDFNS + "type", FOAFNS + "Person", false);
        data.addStatement(creatoruri, FOAFNS + "lastName", lastName, true);
        if (givenName) {
            data.addStatement(creatoruri, FOAFNS + "firstName", givenName, true);
        }
        data.addStatement(creatoruri, RDFNS + "value", alabel, true);
        // connect person to article
        data.addStatement(uri, DCNS + "creator", creatoruri, false);
    };

    // Records one <meshheading> element as a skos:Concept linked to the article.
    var addSubject = function (uri, heading) {
        var descriptorName = childText(heading, ["descriptorname"]);
        if (!descriptorName) {
            return;
        }
        var meshtermuri = meshroot + descriptorName.replace(/ /g, "_");
        // add subject
        data.addStatement(meshtermuri, RDFNS + "type", SKOSNS + "Concept", false);
        data.addStatement(meshtermuri, SKOSNS + "prefLabel", descriptorName, true);
        data.addStatement(meshtermuri, RDFNS + "value", descriptorName, true);
        // connect subject to article
        data.addStatement(uri, DCNS + "subject", meshtermuri, false);
    };

    // Runs `handler(uri, element)` for every element matching `xpath`; a
    // failure on one element is logged and does not stop the others.
    var forEachMatch = function (uri, xpath, handler) {
        try {
            var elements = utilities.gatherElementsOnXPath(document, document, xpath);
            for (var k = 0; k < elements.length; k++) {
                try {
                    handler(uri, elements[k]);
                } catch (inner) {
                    log(inner);
                }
            }
        } catch (outer) {
            log(outer);
        }
    };

    var uri = null;
    try {
        uri = resolveArticleURI();
    } catch (e) {
        log(e);
    }
    if (!uri) {
        // Without an article URI there is nothing to attach statements to.
        log("scrapeXMLRecord: unable to determine the PubMed id of this record");
        return;
    }

    // article title, abstract and affiliation: [xpath, predicate]
    var literalFields = [
        ["//article/articletitle", DCNS + "title"],
        ["//article/abstract/abstracttext", DCNS + "description"],
        ["//article/affiliation", PMNS + "affiliation"]
    ];
    for (var f = 0; f < literalFields.length; f++) {
        try {
            var value = firstValue(literalFields[f][0]);
            if (value) {
                data.addStatement(uri, literalFields[f][1], value, true);
            }
        } catch (e) {
            log(e);
        }
    }

    // article's authors
    forEachMatch(uri, "//authorlist/author", addAuthor);

    // article's MeSH subjects
    forEachMatch(uri, "//meshheadinglist/meshheading", addSubject);

    // journal information
    try {
        var journal_title = firstValue("//medlinejournalinfo/medlineta");
        var jid_value = firstValue("//medlinejournalinfo/nlmuniqueid");
        // The unique id alone is enough to identify the journal.
        if (jid_value) {
            var journal_uri = journalroot + jid_value;
            data.addStatement(journal_uri, RDFNS + "type", PMNS + "Journal", false);
            if (journal_title) {
                data.addStatement(journal_uri, DCNS + "title", journal_title, true);
            }
            // connect article to journal (i don't think this is the right relationship however)
            data.addStatement(uri, PMNS + "isPublishedIn", journal_uri, false);
        }
    } catch (e) {
        log(e);
    }
};
/**
 * Builds the list of URLs to scrape for the PubMed record page `document`.
 *
 * The PubMed id is read from the page URL's `list_uids` parameter, e.g.
 *   http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=15489161&query_hl=7
 * and turned into the matching eutils XML URL, e.g.
 *   http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=15489161&retmode=xml
 * with the `pbtool` suffix appended.
 *
 * @param document the page being viewed; only `document.location.href` is read
 * @returns an array holding the single efetch URL, or an empty array when the
 *          page URL carries no `list_uids` parameter
 */
var getURLsToScrape = function(document) {
    var urls = [];
    var currentURL = String(document.location.href);
    // Accept the parameter anywhere in the query string, including first
    // (after "?"), and take only digits and commas so that nothing else from
    // the page URL can leak into the request.
    var found = /[?&]list_uids=([\d,]+)/.exec(currentURL);
    if (!found) {
        // Not a record page: there is nothing to fetch.
        return urls;
    }
    var pmid = found[1];
    var eutilsuri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=" + pmid + "&retmode=xml" + pbtool;
    urls.push(eutilsuri);
    return urls;
};
//=========================================================
// Entry point: work out which URLs belong to the current page and hand each
// one to piggybank.scrapeURL together with the record scraper.
// NOTE(review): the third argument (alert) is presumably the failure
// callback -- confirm against the Piggy Bank scraper API.
var urls = getURLsToScrape(document);
// Plain indexed loop instead of the non-standard `for each ... in`, which
// would also visit any enumerable properties added to Array.prototype.
for (var u = 0; u < urls.length; u++) {
    piggybank.scrapeURL(urls[u], scrapeXMLRecord, alert);
}
//