package org.wamblee.crawler.impl;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.PrintStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
import org.wamblee.crawler.PageException;
/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Entry point for the crawler.
 */
38 public final class App {
41 * Disabled constructor.
48 private static final Log LOG = LogFactory.getLog(App.class);
50 private static final String LOG_FILE = "crawler.log";
53 * Runs a test program.
54 * @param aArgs Arguments. First argument is the crawler config file name and second argument is
56 * @throws Exception In case of problems.
58 public static void main(String[] aArgs) throws Exception {
59 String configFileName = aArgs[0];
60 String starturl = aArgs[1];
62 FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
63 PrintStream os = new PrintStream(fos);
66 ConfigurationParser parser = new ConfigurationParser(os);
67 InputStream configFile = new FileInputStream(new File(
69 Configuration config = parser.parse(configFile);
71 HttpClient client = new HttpClient();
72 // client.getHostConfiguration().setProxy("localhost", 3128);
74 Crawler crawler = new CrawlerImpl(client, config);
76 System.out.println("Retrieving: " + starturl);
77 Page page = crawler.getPage(starturl);
79 page = page.getAction("channels-favorites").execute();
80 recordInterestingShows(page);
82 page = page.getAction("Nederland 1").execute();
84 page = page.getAction("right-now").execute();
86 page = page.getAction("Het elfde uur").execute();
91 System.out.println("Output written on '" + LOG_FILE + "'");
99 private static void showPage(Page aPage) {
100 Action[] links = aPage.getActions();
101 for (Action link : links) {
102 System.out.println("Link found '" + link.getName() + "'");
104 Element element = aPage.getContent();
105 System.out.println("Retrieved content: " + element.asXML());
108 private static void recordInterestingShows(Page page) throws PageException {
109 Action[] channels = page.getActions();
110 for (Action channel : channels) {
111 examineChannel(channel.getName(), channel.execute().getAction(
112 "right-now").execute());
/**
 * Examines the programs of a channel: prints every program name, and for
 * programs whose lowercased name matches ".*babe.*" checks whether a
 * "record" action is available on the program's page.
 *
 * @param aChannel Channel name, used only for the printed output.
 * @param aPage Channel page whose actions represent programs.
 * @throws PageException In case of problems navigating the pages.
 */
116 private static void examineChannel(String aChannel, Page aPage) throws PageException {
117 Action[] programs = aPage.getActions();
118 for (Action program : programs) {
119 System.out.println(aChannel + " - " + program.getName());
120 if (program.getName().toLowerCase().matches(".*babe.*")) {
121 Page programPage = program.execute();
122 Action record = programPage.getAction("record");
// NOTE(review): operator-precedence bug — '+' binds tighter than '!=', so
// this evaluates ("Recording possible: " + record) != null, which is always
// true, and prints "true" regardless of whether the record action exists.
// Intended: "Recording possible: " + (record != null).
123 System.out.println("Recording possible: " + record != null);