// See also: Orkut Friends Scraper
//
// See also: http://simile.mit.edu/wiki/Orkut_Friends_Scraper
//
// Collector is an abstraction for scraping
//
/**
 * Collector keeps two pieces of state while scraping: a position in the
 * source document (document / currentElement / currentElements, navigated
 * with XPath) and the subject URI of the statements being emitted.
 *
 * It relies on names supplied by the hosting environment and not defined
 * here: `utilities`, `data`, `log` and the DOM's `XPathResult`.
 */
function Collector() {
  // Per-instance state. The two maps shadow the prototype defaults so that
  // separate Collector instances never share (and mutate) the same object.
  this.uri = null;
  this.outputNamespaces = {};
  this.outboundVocabulary = {};
}

Collector.prototype = {
  // Our focus of attention in the source document.
  document: null,
  currentElement: null,
  currentElements: null,

  // Our focus of attention in the model being built.
  uri: null,

  // Tools for picking apart the source document.
  nsResolver: null,

  /**
   * Make `doc` the document being scraped and reset the element focus to it.
   * When the document element has a namespace URI, install a resolver that
   * maps the XPath prefix 'x' to that namespace (any other prefix -> null).
   *
   * @param doc a DOM document (its documentElement is read).
   */
  setDoc: function(doc) {
    this.document = doc;
    this.currentElement = doc;
    this.currentElements = null;
    var namespace = doc.documentElement.namespaceURI;
    this.nsResolver = namespace
      ? function(prefix) { return (prefix == 'x') ? namespace : null; }
      : null;
  },

  /**
   * Store in currentElements whatever utilities.gatherElementsOnXPath
   * returns for `xpath`, evaluated relative to the current element.
   * NOTE(review): the result is later used as an indexable list with a
   * `length` (see mapElmts) - confirm against the helper's contract.
   *
   * @param xpath XPath expression string.
   */
  setElmts: function(xpath) {
    this.currentElements = utilities.gatherElementsOnXPath(
      this.document, this.currentElement, xpath, this.nsResolver);
  },

  /**
   * Visit currentElements[start], [start + step], ... and, for each one,
   * make it the current element before calling func(index, element).
   * The current element is left on the last element visited.
   *
   * @param start index of the first element to visit.
   * @param step  distance between visited indices.
   * @param func  callback invoked as func(index, element).
   */
  mapElmts: function(start, step, func) {
    var len = this.currentElements.length;
    for (var i = start; i < len; i += step) {
      this.currentElement = this.currentElements[i];
      func(i, this.currentElement);
    }
  },

  /**
   * Evaluate `xpath` relative to the current element and return the first
   * node of the result (whatever iterateNext() yields first).
   *
   * @param xpath XPath expression string.
   */
  getNode: function(xpath) {
    return this.document.evaluate(xpath,
                                  this.currentElement,
                                  this.nsResolver,
                                  XPathResult.ANY_TYPE,
                                  null).iterateNext();
  },

  // Tools for assembling the model.

  // prefix -> namespace string; replaced by a per-instance map in the constructor.
  outputNamespaces: {},

  /**
   * Register `prefix` so that expandName('prefix:local') resolves against
   * `urlNamestring`.
   */
  addNamespace: function(prefix, urlNamestring) {
    this.outputNamespaces[prefix] = urlNamestring;
  },

  // name -> expanded name cache; replaced by a per-instance map in the constructor.
  outboundVocabulary: {},

  /**
   * Expand a 'prefix:local' name using the registered namespaces and
   * remember the result in outboundVocabulary.
   *
   * Names that do not split into exactly two parts on ':' are returned
   * unchanged. Names whose prefix has not been registered are also
   * returned unchanged (previously they became 'undefined' + local, which
   * mangled inputs such as 'http://host/path') and are not cached, so a
   * later addNamespace() for that prefix still takes effect.
   *
   * @param name a prefixed name or an already-complete name.
   * @return the expanded name.
   */
  expandName: function(name) {
    // Own-property checks keep inherited Object.prototype members such as
    // 'constructor' from being mistaken for cache entries or prefixes.
    var hasOwn = Object.prototype.hasOwnProperty;
    var cache = this.outboundVocabulary;
    if (hasOwn.call(cache, name) && cache[name]) {
      return cache[name];
    }

    var parts = name.split(':');
    if (parts.length !== 2) {
      cache[name] = name;
      return name;
    }
    if (!hasOwn.call(this.outputNamespaces, parts[0])) {
      return name;
    }

    var expanded = this.outputNamespaces[parts[0]] + parts[1];
    cache[name] = expanded;
    return expanded;
  },

  /**
   * Emit a statement about the current URI whose value is `text`.
   * The final `true` is passed straight to data.addStatement
   * (presumably "value is a literal" - confirm against that API).
   */
  assertText: function(propertyName, text) {
    data.addStatement(this.uri,
                      this.expandName(propertyName),
                      text,
                      true);
  },

  /**
   * Emit a statement about the current URI whose value is `uriText`.
   * The final `false` is passed straight to data.addStatement
   * (presumably "value is a resource" - confirm against that API).
   */
  assertRelation: function(propertyName, uriText) {
    data.addStatement(this.uri,
                      this.expandName(propertyName),
                      uriText,
                      false);
  },

  /**
   * Look up the node at `selector` and, if one is found, emit a statement
   * about the current URI whose value is the node's trimmed `src`.
   * Any error is logged rather than propagated.
   *
   * @param propertyName name of the property to assert.
   * @param selector     XPath expression locating the image node.
   */
  assertNodeImage: function(propertyName, selector) {
    try {
      var imageNode = this.getNode(selector);
      if (imageNode) {
        data.addStatement(this.uri,
                          this.expandName(propertyName),
                          utilities.trimString(imageNode.src),
                          false);
      }
    } catch (e) {
      // Must name the parameter: the former reference to an undeclared
      // `property` threw a ReferenceError from inside this handler.
      log("Error seeking <"+this.uri+">'"+propertyName+": "+e);
    }
  },

  /**
   * Set the current URI from the trimmed `href` of the node at `xpath`,
   * then emit an rdf:type statement giving it the type `typeName`.
   *
   * If the node cannot be read the error is logged, the current URI is
   * left untouched and no type statement is emitted, so the type is never
   * attributed to a stale or null subject.
   *
   * @param xpath    XPath expression locating the anchor.
   * @param typeName name of the type, e.g. 'foaf:Person'.
   */
  setURIFromAnchor: function(xpath, typeName) {
    try {
      this.uri = utilities.trimString(this.getNode(xpath).href);
    } catch (e) {
      log(e);
      return;
    }
    data.addStatement(this.uri,
                      this.expandName('rdf:type'),
                      this.expandName(typeName),
                      false);
  }
};
//
// Now routines specific to the Orkut Friends page.
//
// The single collector used by this scraper, preloaded with the namespace
// prefixes that the property and type names below are written with.
var c = new Collector();
(function (prefixTable) {
  // Register the [prefix, namespace] pairs in the order listed.
  for (var n = 0; n < prefixTable.length; n++) {
    c.addNamespace(prefixTable[n][0], prefixTable[n][1]);
  }
})([
  ['rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'],
  ['rdfs', 'http://www.w3.org/2000/01/rdf-schema#'],
  ['dc', 'http://purl.org/dc/elements/1.1/'],
  ['foaf', 'http://xmlns.com/foaf/0.1/'],
  ['loc', 'http://simile.mit.edu/2005/05/ontologies/location#']
]);
// Collect one friend from the row currently in focus (c.currentElement).
// A row with no anchor in its second cell is skipped. Otherwise the anchor's
// href becomes the subject URI (typed foaf:Person), and the row's name,
// address, e-mail and picture are asserted about that subject.
// The index argument supplied by mapElmts is not used.
c.collectFriend = function (i) {
  var anchorPath = './TD[2]/A[1]';
  if (!c.getNode(anchorPath)) {
    return;
  }

  c.setURIFromAnchor(anchorPath, 'foaf:Person');
  c.assertRelation('rdfs:seeAlso', c.getNode(anchorPath).href);

  // The same text is recorded both as the title and as the name.
  var displayName = c.getNode('./TD[4]/A[1]/text()[1]').nodeValue;
  c.assertText('dc:title', displayName);
  c.assertText('foaf:name', displayName);

  // The address is read from the node right after the first <BR> of cell 4.
  var firstBreak = c.getNode('./TD[4]/BR[1]');
  var addressText = firstBreak.nextSibling.nodeValue;
  c.assertText('loc:address', utilities.trimString(addressText));

  // The e-mail is read from the node right after the second <BR>.
  var secondBreak = c.getNode('./TD[4]/BR[2]');
  var emailText = secondBreak.nextSibling.nodeValue;
  c.assertRelation('foaf:mbox', 'mailto:' + utilities.trimString(emailText));

  c.assertNodeImage('foaf:depiction', './TD[2]/A[1]/IMG[1]');
};
// Collect all friends found in document `d`, then follow the pager link.
// Every second row of the friend table is visited, starting at index 2.
// The pager link is the third anchor of a td.S cell in a first row, or else
// the first anchor. Unless it is missing or its text is 'first', its target
// is handed to piggybank.scrapeURL with this same function as the callback.
c.collectFriends = function (d) {
  c.setDoc(d);
  c.setElmts('//div[@id="friendtable"]/table[@class="friendtable"]/tbody/tr');
  c.mapElmts(2, 2, c.collectFriend);

  var pagerLink = c.getNode('//tr[1]/td[@class="S"]/a[3]');
  if (!pagerLink) {
    pagerLink = c.getNode('//tr[1]/td[@class="S"]/a[1]');
  }
  if (!pagerLink || pagerLink.text == 'first') {
    return;
  }

  var reportFailure = function (e) {
    alert("Failed: " + e);
  };
  piggybank.scrapeURL(pagerLink.href, c.collectFriends, reportFailure);
};
//
// Finally, just do it!
//
// Start with the document this script runs against; collectFriends itself
// requests any further page through piggybank.scrapeURL.
c.collectFriends(document);
//

