Changeset 9299

Show
Ignore:
Timestamp:
05/14/08 00:03:59 (2 months ago)
Author:
dfhuynh
Message:

Refactored the Google/Yahoo index checks out into IndexingUtilities, and added an index test that strips out URL queries first.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • collector/trunk/src/main/java/TestIndexedReferrers.java

    r9298 r9299  
    44import java.io.FileWriter; 
    55import java.io.IOException; 
    6 import java.io.InputStream; 
    7 import java.io.InputStreamReader; 
    86import java.io.LineNumberReader; 
    9 import java.io.Reader; 
    107import java.io.Writer; 
    11 import java.net.HttpURLConnection; 
    12 import java.net.URL; 
    138import java.text.ParseException; 
    149 
    15 import org.apache.commons.codec.net.URLCodec; 
    1610 
    1711 
     
    5549    static void processLine(int lineNo, String line, Writer writer) throws ParseException, IOException { 
    5650        String urlString = line.substring(1, line.length() - 1); 
    57         boolean googleIndexed = checkWithGoogle(urlString); 
    58         boolean yahooIndexed = checkWithYahoo(urlString); 
     51        boolean googleIndexed = IndexingUtilities.checkWithGoogle(urlString); 
     52        boolean yahooIndexed = IndexingUtilities.checkWithYahoo(urlString); 
    5953         
    6054        writer.write(urlString + "\t" + googleIndexed + "\t" + yahooIndexed + "\n"); 
    6155        System.out.println(lineNo + ". " + urlString + " - " + googleIndexed + ", " + yahooIndexed); 
    6256    } 
    63      
    64     static boolean checkWithGoogle(String urlString) { 
    65         String googleUrlString = "http://www.google.com/search?q=" + 
    66         encode("info:" + urlString); 
    67      
    68             String result = getContent(googleUrlString); 
    69             if (result.equals("")) { 
    70                 System.err.println("We're getting blocked by Google."); 
    71                 throw new InternalError(); 
    72             } 
    73             return result.length() > 0 && (result.indexOf("Google can show you the following information for") > 0); 
    74     } 
    75  
    76     static boolean checkWithYahoo(String urlString) { 
    77         String googleUrlString = "http://siteexplorer.search.yahoo.com/advsearch?ei=UTF-8&p=" + 
    78                 encode(urlString); 
    79      
    80             String result = getContent(googleUrlString); 
    81             if (result.equals("")) { 
    82                 System.err.println("We're getting blocked by Yahoo."); 
    83                 throw new InternalError(); 
    84             } 
    85             return result.length() > 0 && (result.indexOf("We were unable to find any results") < 0); 
    86     } 
    87  
    88     static char[] chars = new char[8192]; 
    89  
    90     static String getContent(String urlString) { 
    91         try { 
    92             URL url = new URL(urlString); 
    93              
    94             HttpURLConnection connection = (HttpURLConnection) url.openConnection(); 
    95             connection.setInstanceFollowRedirects(true); 
    96             connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14"); 
    97             connection.setRequestProperty("Accept", "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"); 
    98             connection.setRequestProperty("Accept-Language", "en-us,en;q=0.5"); 
    99             connection.setRequestProperty("Accept-Encoding", "deflate"); 
    100             connection.setRequestProperty("Accept-Charset", "utf-8;q=0.7"); 
    101             connection.setRequestProperty("Referer", "http://www.google.com/"); 
    102             connection.connect(); 
    103              
    104             if (connection.getResponseCode() == 200) { 
    105                     InputStream is = connection.getInputStream(); 
    106                     try { 
    107                         Reader reader = new InputStreamReader(is, "UTF-8"); 
    108                          
    109                         StringBuffer sb = new StringBuffer(); 
    110                          
    111                         int c = 0; 
    112                         while ((c = reader.read(chars)) > 0) { 
    113                             sb.append(chars, 0, c); 
    114                         } 
    115                          
    116                         return sb.toString(); 
    117                     } catch (Exception e) { 
    118                     } finally { 
    119                         is.close(); 
    120                     } 
    121             } 
    122         } catch (Exception e) { 
    123         } 
    124         return ""; 
    125     } 
    126      
    127     private static final String URL_ENCODING = "UTF-8"; 
    128     private static final URLCodec codec = new URLCodec(); 
    129  
    130     static public String encode(String s) { 
    131         try { 
    132             return codec.encode(s, URL_ENCODING); 
    133         } catch (Exception e) { 
    134             throw new RuntimeException("Exception encoding " + s + " with " + URL_ENCODING + " encoding."); 
    135         } 
    136     } 
    13757}