// Scraper Description

/*
 * NCBI XML PubMed JavaScript scraper
 *
 * Author: eric miller <em@potlach.org>
 *
 * TODO: 
 * 
 * - model article <-> journal publication relationship
 * - extract and associate relevant gene and protein information with article
 * - enhance skos model of mesh terms
 */

// Vocabulary namespaces. Predicate and class URIs are built by appending a
// local name to one of these (e.g. DCNS + "title").
var RDFNS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
var DCNS = "http://purl.org/dc/elements/1.1/";
// DCTNS is not referenced in the visible portion of this file.
var DCTNS = "http://purl.org/dc/terms/";
var FOAFNS = "http://xmlns.com/foaf/0.1/";
var SKOSNS = "http://www.w3.org/2004/02/skos/core#";
// Scraper-specific vocabulary (affiliation, Journal, isPublishedIn).
// NOTE(review): example.com looks like a placeholder domain -- confirm the intended namespace.
var PMNS = "http://example.com/vocab#";
// MESHNS and GENENS are not referenced in the visible portion of this file.
var MESHNS = "http://www.nlm.nih.gov/mesh/vocab#";
var GENENS = "http://uniprot.org/gene/vocab#";

// URI prefixes for the resources this scraper mints: MeSH terms, PubMed
// records/authors and journals. generoot is not referenced in the visible code.
var meshroot = "http://www.nlm.nih.gov/mesh/term/";
var generoot = "http://purl.org/lsid/gene/";
var pubmedroot = "http://purl.org/lsid/ncbi/";
var journalroot = "http://purl.org/lsid/nlm/journal/";

// Query-string suffix appended to the eutils efetch URL built in getURLsToScrape.
var pbtool = "&tool=piggybank&email=em@potlach.org";

// Evaluates `xpath` relative to `contextNode` and returns the first node the
// result iterator yields (whatever iterateNext() returns when nothing matches).
//
//   document    - document whose evaluate() method runs the query
//   contextNode - node the expression is evaluated against
//   xpath       - XPath expression string
var getNode = function(document, contextNode, xpath) {
    var resultType = XPathResult.ANY_TYPE;
    var matches = document.evaluate(xpath, contextNode, null, resultType, null);
    var firstMatch = matches.iterateNext();
    return firstMatch;
};
    
// Normalises a scraped string by handing it to the host-provided
// utilities.trimString and returning whatever that call returns.
var cleanString = function(s) {
    var cleaned = utilities.trimString(s);
    return cleaned;
};

// Returns the cleaned innerHTML of the first element matching `xpath` in
// `document`, or null when nothing matches.
var firstElementMarkup = function(document, xpath) {
    var matches = utilities.gatherElementsOnXPath(document, document, xpath);
    if (!matches || matches.length === 0) {
        return null;
    }
    return cleanString(matches[0].innerHTML);
};

// Returns the cleaned nodeValue of the node `xpath` selects relative to
// `contextNode`, or null when the node is absent.
var childText = function(document, contextNode, xpath) {
    var node = getNode(document, contextNode, xpath);
    if (!node) {
        return null;
    }
    return cleanString(node.nodeValue);
};

// Works out the URI that identifies the article being scraped.
// Order of preference: a host-supplied global `uri` (if the sandbox defines
// one), the record's own PMID element, then the `id` query parameter of the
// document URL. Returns null when none of these is available.
var resolveRecordURI = function(document) {
    // `typeof` never throws for an undeclared identifier, so this is safe
    // whether or not the host defines `uri`.
    if (typeof uri !== "undefined" && uri) {
        return uri;
    }

    var pmid = null;
    try {
        pmid = firstElementMarkup(document, '//medlinecitation/pmid');
    } catch (e) {
        log(e);
    }

    // presumably the fetched document's location is the eutils URL
    // (...&id=<pmid>&retmode=xml) -- used only as a fallback
    if (!pmid && document && document.location && document.location.href) {
        var idMatch = /[?&]id=([^&#]+)/.exec(document.location.href);
        if (idMatch) {
            pmid = idMatch[1];
        }
    }

    return pmid ? pubmedroot + "pmid/" + pmid : null;
};

// Emits the foaf:Person statements for one <author> element and links the
// person to the article. Authors without a last name are skipped.
var addAuthor = function(document, recordURI, author) {
    var lastName = childText(document, author, './lastname[1]/text()[1]');
    if (!lastName) {
        return;
    }

    // some records use forename, some use firstname
    var firstName = childText(document, author, './forename[1]/text()[1]');
    if (!firstName) {
        firstName = childText(document, author, './firstname[1]/text()[1]');
    }

    var alabel = firstName ? firstName + " " + lastName : lastName;
    // spaces are replaced the same way as for MeSH term ids so the URI has no blanks
    var aid = (firstName ? firstName + "_" + lastName : lastName).replace(/ /g, "_");
    var creatoruri = pubmedroot + "pubmed/author/" + aid;

    // add person
    data.addStatement(creatoruri, RDFNS + "type", FOAFNS + "Person", false);
    data.addStatement(creatoruri, FOAFNS + "lastName", lastName, true);
    if (firstName) {
        data.addStatement(creatoruri, FOAFNS + "firstName", firstName, true);
    }
    data.addStatement(creatoruri, RDFNS + "value", alabel, true);

    // connect person to article
    data.addStatement(recordURI, DCNS + "creator", creatoruri, false);
};

// Emits the skos:Concept statements for one <meshheading> element and links
// the concept to the article. Headings without a descriptor name are skipped.
var addMeshSubject = function(document, recordURI, heading) {
    var descriptorName = childText(document, heading, './descriptorname[1]/text()[1]');
    if (!descriptorName) {
        return;
    }

    var meshtermuri = meshroot + descriptorName.replace(/ /g, "_");

    // add subject
    data.addStatement(meshtermuri, RDFNS + "type", SKOSNS + "Concept", false);
    data.addStatement(meshtermuri, SKOSNS + "prefLabel", descriptorName, true);
    data.addStatement(meshtermuri, RDFNS + "value", descriptorName, true);

    // connect subject to article
    data.addStatement(recordURI, DCNS + "subject", meshtermuri, false);
};

// Scrapes the XML NCBI PubMed record held in `document` and records what it
// finds through data.addStatement: title, abstract, affiliation, authors,
// MeSH subjects and the publishing journal.
//
// Scraping is best-effort: each section is wrapped in its own try/catch so a
// failure in one section is logged and the remaining sections still run.
// Returns nothing. If the article's URI cannot be determined, the problem is
// logged and no statements are added.
var scrapeXMLRecord = function (document) {

    log(document);

    var recordURI = resolveRecordURI(document);
    if (!recordURI) {
        log("scrapeXMLRecord: could not determine the record URI; nothing scraped");
        return;
    }

    // single-valued literal properties: [xpath, predicate]
    var literalProperties = [
        ["//article/articletitle", DCNS + "title"],
        ["//article/abstract/abstracttext", DCNS + "description"],
        ["//article/affiliation", PMNS + "affiliation"]
    ];
    for (var p = 0; p < literalProperties.length; p++) {
        try {
            var literalValue = firstElementMarkup(document, literalProperties[p][0]);
            if (literalValue) {
                data.addStatement(recordURI, literalProperties[p][1], literalValue, true);
            }
        } catch (e) {
            log(e);
        }
    }

    // article's authors
    try {
        var authors = utilities.gatherElementsOnXPath(document, document, '//authorlist/author');
        for (var a = 0; a < authors.length; a++) {
            // per-author guard: one malformed author must not drop the rest
            try {
                addAuthor(document, recordURI, authors[a]);
            } catch (e) {
                log(e);
            }
        }
    } catch (e) {
        log(e);
    }

    // article's MeSH subjects
    try {
        var subjects = utilities.gatherElementsOnXPath(document, document, '//meshheadinglist/meshheading');
        for (var s = 0; s < subjects.length; s++) {
            try {
                addMeshSubject(document, recordURI, subjects[s]);
            } catch (e) {
                log(e);
            }
        }
    } catch (e) {
        log(e);
    }

    // journal information
    try {
        var journalTitle = firstElementMarkup(document, '//medlinejournalinfo/medlineta');
        var journalId = firstElementMarkup(document, '//medlinejournalinfo/nlmuniqueid');

        // the NLM unique id is what the journal URI is minted from
        if (journalId) {
            var journalURI = journalroot + journalId;

            data.addStatement(journalURI, RDFNS + "type", PMNS + "Journal", false);
            if (journalTitle) {
                data.addStatement(journalURI, DCNS + "title", journalTitle, true);
            }

            // connect article to journal (i don't think this is the right relationship however)
            data.addStatement(recordURI, PMNS + "isPublishedIn", journalURI, false);
        }
    } catch (e) {
        log(e);
    }
};
    
// Builds the list of URLs to scrape for the PubMed record currently shown in
// `document`.
//
// The record page looks like
//   http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=15489161&query_hl=7
// and PubMed provides the same record as XML via the eutils:
//   http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=15489161&retmode=xml
//
//   document - document whose location.href carries a list_uids parameter
//
// Returns an array with the single eutils efetch URL for that record, or an
// empty array when the current URL has no list_uids value.
var getURLsToScrape = function(document) {

    var urls = [];

    var currentURL = document.location.href;

    // Match list_uids wherever it appears in the query string (first or later
    // parameter) and stop at the next parameter or fragment.
    var pmidMatch = /[?&]list_uids=([^&#]+)/.exec(currentURL);
    if (!pmidMatch) {
        // not a record page: there is nothing to fetch
        return urls;
    }
    var pmid = pmidMatch[1];

    var eutilsuri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=" + pmid + "&retmode=xml" + pbtool;

    urls.push(eutilsuri);

    return urls;
};


//=========================================================


// Entry point: work out which URLs to fetch for the current page and hand
// each one to piggybank.scrapeURL together with scrapeXMLRecord.
// presumably the third argument (alert) is the failure callback -- confirm
// against the Piggy Bank scraper API.
var urls = getURLsToScrape(document);

// A plain indexed loop replaces the non-standard `for each ... in` statement,
// so only the array's own elements are visited.
for (var i = 0; i < urls.length; i++) {
    var url = urls[i];
    piggybank.scrapeURL(url, scrapeXMLRecord, alert);
}

//