// See the scraper information
//
// Namespace URI prefixes. Each is concatenated with a local name to form the
// predicate or class URI handed to data.addStatement further down in this file.

// RDF syntax namespace; used below for the "type" predicate.
const rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
// Dublin Core elements namespace; used below for "title", "author" and "date".
const dc = "http://purl.org/dc/elements/1.1/";
// Namespace bound to the name `bibtex`; used below for the "Publication" class and
// the "link", "details", "abstract" and "relevance" predicates.
const bibtex = "http://www.ontoweb.org/ontology/1#";
// Lower-case three-letter month prefixes in calendar order, so that
// (index + 1) is the month number. All twelve months must be present:
// a missing entry shifts every later month by one.
var monthNames = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec"
];

/**
 * Builds a timestamp string of the form "YYYY-MM-00T00:00:00Z" from a month
 * name and a year.
 *
 * The month is matched case-insensitively by prefix against monthNames, so
 * both "Jun" and "June" resolve to 06. A month that matches no entry falls
 * back to 01. The day field is always emitted as "00", as the original code did.
 *
 * `var` is used deliberately: the rest of this file targets a legacy
 * JavaScript engine.
 *
 * @param {string} month - Month name or abbreviation (any letter case).
 * @param {string|number} year - Year, inserted into the result unchanged.
 * @returns {string} The formatted timestamp.
 */
function makeDate(month, year) {
    var lowered = month.toLowerCase();
    var monthNumber = 1; // fallback when the month name is not recognised

    // The counter is declared locally; it used to leak into the global scope.
    for (var i = 0; i < monthNames.length; i++) {
        if (lowered.indexOf(monthNames[i]) === 0) {
            monthNumber = i + 1;
            break;
        }
    }

    var paddedMonth = monthNumber < 10 ? "0" + monthNumber : String(monthNumber);
    return year + "-" + paddedMonth + "-00T00:00:00Z";
}
/**
 * Scrapes one search-result entry and records what it finds through
 * data.addStatement.
 *
 * The entry is identified by the first <a> child of the authors div's parent
 * whose href contains "citation.cfm?id="; that href is used as the subject URI
 * of every statement. If no such link exists, nothing is recorded.
 *
 * Title, authors, date, details, abstract and relevance are each optional:
 * a lookup that fails is logged via log() and the field is omitted.
 *
 * `var` is used deliberately: the rest of this file targets a legacy
 * JavaScript engine.
 *
 * @param document     Document used to evaluate XPath expressions.
 * @param trElement    Row whose third cell is searched for the relevance <img>.
 * @param divElement   The div whose innerHTML is the comma-separated author list.
 */
function scrapeAnEntry(document, trElement, divElement) {
    var uri, title, date, addinfo, abstrakt, relevance;
    var authors = utilities.trimString(divElement.innerHTML);

    // Find the citation link among the <a> children of the div's parent.
    var aElements = document.evaluate("a", divElement.parentNode, null, XPathResult.ANY_TYPE, null);
    for (var aElement = aElements.iterateNext(); aElement; aElement = aElements.iterateNext()) {
        if (aElement.href.indexOf("citation.cfm?id=") >= 0) {
            uri = aElement.href;
            title = utilities.trimString(aElement.innerHTML);
            break;
        }
    }

    // Every statement needs the citation URI as its subject, so without one
    // there is nothing to record and no point in probing the remaining cells.
    if (!uri) {
        return;
    }

    var tableElement = divElement.parentNode.parentNode.parentNode;

    // Second row, first cell: text of the form "<month> <year>".
    try {
        date = utilities.trimString(tableElement.rows.item(1).cells.item(0).innerHTML);
    } catch (e) {
        log(e);
    }

    // Second row, third cell: first <strong> descendant holds the details text.
    try {
        addinfo = utilities.trimString(
            document.evaluate(".//strong", tableElement.rows.item(1).cells.item(2), null, XPathResult.ANY_TYPE, null)
                .iterateNext().innerHTML);
    } catch (e) {
        log(e);
    }

    // Fourth row: first td/div holds the abstract.
    try {
        abstrakt = utilities.trimString(
            document.evaluate("./td/div", tableElement.rows.item(3), null, XPathResult.ANY_TYPE, null)
                .iterateNext().innerHTML);
    } catch (e) {
        log(e);
    }

    // The relevance value is the image file's base name (between the last "/"
    // and the last "." of its src).
    try {
        var img = document.evaluate(".//img", trElement.cells.item(2), null, XPathResult.ANY_TYPE, null).iterateNext();
        var relevanceURL = img.src;
        var slash = relevanceURL.lastIndexOf("/");
        var dot = relevanceURL.lastIndexOf(".");
        relevance = relevanceURL.substring(slash + 1, dot);
    } catch (e) {
        log(e);
    }

    data.addStatement(uri, rdf + "type", bibtex + "Publication", false);
    data.addStatement(uri, bibtex + "link", uri, true);
    if (title) {
        data.addStatement(uri, dc + "title", title, true);
    }
    if (authors) {
        var names = authors.split(/, */);
        // The counter is declared locally; it used to leak into the global scope.
        for (var j = 0; j < names.length; j++) {
            // A trailing comma yields an empty piece; do not record it as an author.
            if (names[j]) {
                data.addStatement(uri, dc + "author", names[j], true);
            }
        }
    }
    if (date) {
        var space = date.indexOf(" ");
        data.addStatement(uri, dc + "date",
            makeDate(date.substr(0, space), date.substr(space + 1)), true);
    }
    if (addinfo) {
        data.addStatement(uri, bibtex + "details", addinfo, true);
    }
    if (abstrakt) {
        data.addStatement(uri, bibtex + "abstract", abstrakt, true);
    }
    if (relevance) {
        data.addStatement(uri, bibtex + "relevance", relevance, true);
    }
}
/**
 * Walks every <div> in the given document and, for each one whose class name
 * is "authors", hands it to scrapeAnEntry together with its sixth ancestor.
 *
 * @param document  Document to scan; also passed through to scrapeAnEntry.
 */
function scrapePage(document) {
    var candidates = document.evaluate("//div", document, null, XPathResult.ANY_TYPE, null);
    for (var node = candidates.iterateNext(); node; node = candidates.iterateNext()) {
        if (node.className != "authors") {
            continue;
        }
        // Climb six parent levels from the authors div to reach the element
        // that scrapeAnEntry receives as its row argument.
        var ancestor = node;
        for (var level = 0; level < 6; level++) {
            ancestor = ancestor.parentNode;
        }
        scrapeAnEntry(document, ancestor, node);
    }
}
/**
 * Collects the result-page URLs linked from the current (global) document.
 *
 * Looks at every <a> that is a direct child of a <td>, keeps the hrefs that
 * contain "results.cfm?query=", and drops repeats while preserving the order
 * of first appearance.
 *
 * @returns {Array} The distinct matching hrefs, in document order.
 */
function getPagesToScrape() {
    var pages = [];
    var alreadyListed = {};
    var links = document.evaluate("//td/a", document, null, XPathResult.ANY_TYPE, null);
    for (var link = links.iterateNext(); link; link = links.iterateNext()) {
        var target = link.href;
        // Skip links that are not result pages, and result pages seen before.
        if (target.indexOf("results.cfm?query=") < 0 || alreadyListed[target]) {
            continue;
        }
        alreadyListed[target] = true;
        pages.push(target);
    }
    return pages;
}
// Entry point: find the result pages linked from the current document and ask
// piggybank to load each one, with scrapePage as the callback.
var urls = getPagesToScrape();
// Array.prototype.forEach replaces the non-standard `for each (... in ...)`
// statement: it visits array elements only, and parses in standard engines.
urls.forEach(function (url) {
    piggybank.scrapeURL(url, scrapePage);
});
//

