(no commit message)
authorErik Brakkee <erik@brakkee.org>
Thu, 16 Mar 2006 18:56:17 +0000 (18:56 +0000)
committerErik Brakkee <erik@brakkee.org>
Thu, 16 Mar 2006 18:56:17 +0000 (18:56 +0000)
13 files changed:
crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java
crawler/basic/src/org/wamblee/crawler/Action.java
crawler/basic/src/org/wamblee/crawler/Crawler.java
crawler/basic/src/org/wamblee/crawler/GetPageRequest.java
crawler/basic/src/org/wamblee/crawler/PageRequest.java
crawler/basic/src/org/wamblee/crawler/PostPageRequest.java
crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java
crawler/basic/src/org/wamblee/crawler/impl/App.java
crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java
crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java
crawler/kiss/build.xml
crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java
crawler/kiss/src/org/wamblee/crawler/kiss/Program.java

index 63764f53439d2c6ce836cb437ad2db100a4792f7..73132aa89736e8d06abd9a0e2cd1a310a4a96fa1 100644 (file)
@@ -23,6 +23,8 @@ import java.io.PrintStream;
 
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
@@ -38,6 +40,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.w3c.dom.Document;
 import org.w3c.tidy.Tidy;
+import org.wamblee.io.FileResource;
 import org.wamblee.xml.XSLT;
 
 /**
@@ -47,6 +50,9 @@ public abstract class AbstractPageRequest implements PageRequest {
 
     private static final Log LOG = LogFactory.getLog(AbstractPageRequest.class);
     private static final String REDIRECT_HEADER = "Location";
+   
+    private int _maxTries; 
+    private int _maxDelay; 
 
     private NameValuePair[] _params;
 
@@ -54,13 +60,15 @@ public abstract class AbstractPageRequest implements PageRequest {
     
     private PrintStream _os; 
 
-    protected AbstractPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+    protected AbstractPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) {
         if ( aParams == null ) { 
             throw new IllegalArgumentException("aParams is null");
         }
         if ( aXslt == null ) { 
             throw new IllegalArgumentException("aXslt is null");
         }
+        _maxTries = aMaxTries;
+        _maxDelay = aMaxDelay;
         _params = aParams;
         _xslt = aXslt;
         _os = aOs; 
@@ -76,8 +84,24 @@ public abstract class AbstractPageRequest implements PageRequest {
     protected NameValuePair[] getParameters() { 
         return _params;
     }
+    
+    /** Executes the request; after a TransformerException it is retried until _maxTries attempts were made. */
+    protected Document executeMethod(HttpClient client, HttpMethod method) throws TransformerException {
+        // A non-positive _maxTries still gets one attempt instead of failing without trying.
+        int triesLeft = Math.max(1, _maxTries);
+        while ( true ) {
+            try {
+                return executeMethodWithoutRetries(client, method);
+            } catch (TransformerException e) {
+                if ( --triesLeft <= 0 ) {
+                    throw e; // out of attempts: report the last failure to the caller
+                }
+                LOG.warn("Request failed, " + triesLeft + " tries left: " + e.getMessage());
+            }
+        }
+    }
 
-    protected Document executeMethod(HttpClient client, HttpMethod method) {
+    protected Document executeMethodWithoutRetries(HttpClient client, HttpMethod method) throws TransformerException {
         try {
             // Execute the method.
             method = executeWithRedirects(client, method);
@@ -106,7 +130,7 @@ public abstract class AbstractPageRequest implements PageRequest {
             }
             xhtml.flush();
             byte[] xhtmlData = xhtml.toByteArray();
-            Document transformed = XSLT.transform(xhtmlData, new File(_xslt));
+            Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt)));
             _os.println("Transformed result is: ");
             Transformer transformer = TransformerFactory.newInstance().newTransformer();
             transformer.setParameter(OutputKeys.INDENT, "yes");
@@ -114,13 +138,26 @@ public abstract class AbstractPageRequest implements PageRequest {
             transformer.transform(new DOMSource(transformed), new StreamResult(_os));
             
             return transformed;
-        } catch (Exception e) {
+        } catch (HttpException e) { 
+            throw new RuntimeException(e.getMessage(), e); 
+        } catch (IOException e) { 
+            throw new RuntimeException(e.getMessage(), e);
+        } catch (TransformerConfigurationException e) { 
             throw new RuntimeException(e.getMessage(), e);
         } finally {
             // Release the connection.
             method.releaseConnection();
         }
     }
+    
+    /** Sleeps for a random time of at most _maxDelay milliseconds; called before a request is sent. */
+    private void delay() {
+        try {
+            Thread.sleep((long)((float)_maxDelay* Math.random()));
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt(); // restore the flag so callers can see the interruption
+        }
+    }
 
     /**
      * @param aClient
@@ -129,6 +166,7 @@ public abstract class AbstractPageRequest implements PageRequest {
      * @throws HttpException
      */
     private HttpMethod executeWithRedirects(HttpClient aClient, HttpMethod aMethod) throws IOException, HttpException {
+        delay();
         int statusCode = aClient.executeMethod(aMethod);
 
         switch (statusCode) { 
index a4df7a1fc736e0186236c61a86a63fdcf80cb3f6..0002814d506604f0277ea37163f608598570d2d3 100644 (file)
@@ -33,7 +33,7 @@ public interface Action {
      * Executes the action. 
      * @return
      */
-    Page execute();
+    Page execute() throws PageException;
     
     /**
      * Gets a description of the action. THe element returned is the action element
index f55eebb30afc1bed9b86c73d0e29eb6dddd80cbb..07dff3fdb5f7d6c42dfd3feb43ebc30101466c29 100644 (file)
@@ -27,7 +27,7 @@ public interface Crawler {
      * @param aUrl Url of page. 
      * @return Page to retrieve.
      */
-    Page getPage(String aUrl);
+    Page getPage(String aUrl) throws PageException;
     
     /**
      * Gets the content for a specific page. 
@@ -35,5 +35,5 @@ public interface Crawler {
      * @param aType Type of page.  
      * @return Page. 
      */
-    Page getPage(String aUrl, PageType aType); 
+    Page getPage(String aUrl, PageType aType) throws PageException;
 }
index 7d99c1e814241c137026bdb88cdec01c65dc91e5..9a9d02e4ed2536f8619bc7f23dc13fac67255754 100644 (file)
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- */ 
+ */
 
 package org.wamblee.crawler;
 
 import java.io.PrintStream;
 
+import javax.xml.transform.TransformerException;
+
 import org.apache.commons.httpclient.HttpClient;
 import org.apache.commons.httpclient.HttpMethod;
 import org.apache.commons.httpclient.NameValuePair;
@@ -25,35 +27,40 @@ import org.apache.commons.httpclient.methods.GetMethod;
 import org.w3c.dom.Document;
 
 /**
- * Gets a page by issueing a get request. 
+ * Gets a page by issuing a get request.
  */
 public class GetPageRequest extends AbstractPageRequest {
-    
-    public GetPageRequest(NameValuePair[] aParams, String aXslt) { 
-        super(aParams, aXslt, null);
+
+    public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) {
+        super(aMaxTries, aMaxDelay, aParams, aXslt, null);
     }
-    
-    public GetPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) { 
-        super(aParams, aXslt, aOs);
+
+    public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+        super(aMaxTries, aMaxDelay, aParams, aXslt, aOs);
     }
-    
 
-    /* (non-Javadoc)
+    /*
+     * (non-Javadoc)
+     * 
      * @see org.wamblee.crawler.PageRequest#getPage(org.apache.commons.httpclient.HttpClient)
      */
-    public Document execute(String aUrl, HttpClient aClient) {
+    public Document execute(String aUrl, HttpClient aClient)
+            throws PageException {
         HttpMethod method = new GetMethod(aUrl);
-        if ( getParameters().length > 0 ) { 
+        if (getParameters().length > 0) {
             String oldQueryString = method.getQueryString();
             method.setQueryString(getParameters());
             String queryString = method.getQueryString();
-            if ( oldQueryString.length() > 0 ) {
+            if (oldQueryString != null && oldQueryString.length() > 0) { // null when aUrl has no query part
                 queryString = queryString + '&' + oldQueryString;
                 method.setQueryString(queryString);
             }
         }
-        
-        return executeMethod(aClient, method);
+        try {
+            return executeMethod(aClient, method);
+        } catch (TransformerException e) {
+            throw new PageException(e.getMessage(), e);
+        }
     }
 
 }
index cf88bbf878a756303a157f56632f8badb1837d7a..753bc05b7178fbc1fd0ede94a6e0b300ed734723 100644 (file)
@@ -29,7 +29,7 @@ public interface PageRequest {
      * @param aClient Http client to use. 
      * @return Client. 
      */
-     Document execute(String aUrl, HttpClient aClient);
+     Document execute(String aUrl, HttpClient aClient) throws PageException;
      
      /**
       * Overrides the Xslt to use. 
index 10ad783a76be7f3b8353818d4343094e0f885c1a..2257652204c2d706b1e17de2507dec95451496e6 100644 (file)
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- */ 
+ */
 
 package org.wamblee.crawler;
 
 import java.io.PrintStream;
 
+import javax.xml.transform.TransformerException;
+
 import org.apache.commons.httpclient.HttpClient;
 import org.apache.commons.httpclient.NameValuePair;
 import org.apache.commons.httpclient.methods.PostMethod;
@@ -27,23 +29,31 @@ import org.w3c.dom.Document;
  * Retrieving pages using the post method.
  */
 public class PostPageRequest extends AbstractPageRequest {
-    
-    public PostPageRequest(NameValuePair[] aParams, String aXslt) { 
-        super(aParams, aXslt, null);
+
+    public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) {
+        super(aMaxTries, aMaxDelay, aParams, aXslt, null);
     }
-    
-    public PostPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) { 
-        super(aParams, aXslt, aOs);
+
+    public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt,
+            PrintStream aOs) {
+        super(aMaxTries, aMaxDelay, aParams, aXslt, aOs);
     }
-    
-    
-    /* (non-Javadoc)
-     * @see org.wamblee.crawler.PageRequest#execute(java.lang.String, org.apache.commons.httpclient.HttpClient)
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see org.wamblee.crawler.PageRequest#execute(java.lang.String,
+     *      org.apache.commons.httpclient.HttpClient)
      */
-    public Document execute(String aUrl, HttpClient aClient) {
+    public Document execute(String aUrl, HttpClient aClient)
+            throws PageException {
         PostMethod method = new PostMethod(aUrl);
         method.addParameters(getParameters());
-        return executeMethod(aClient, method);
+        try {
+            return executeMethod(aClient, method);
+        } catch (TransformerException e) {
+            throw new PageException(e.getMessage(), e);
+        }
     }
 
 }
index d0fe0806d424dd6b1c74e3c64aa4621e6bfbc8f0..e5dac7d0a00dca03ec72f547d0728c0de7b4eea5 100644 (file)
@@ -20,6 +20,7 @@ import org.dom4j.Element;
 import org.wamblee.crawler.Action;
 import org.wamblee.crawler.Crawler;
 import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
 import org.wamblee.crawler.PageType;
 
 /**
@@ -59,7 +60,7 @@ public class ActionImpl implements Action {
     /* (non-Javadoc)
      * @see org.wamblee.crawler.Action#execute()
      */
-    public Page execute() {
+    public Page execute() throws PageException {
         if ( _type == null) {
             return _crawler.getPage(_reference);
         }
index 75fd3b09fa675e973cd64dc6c8f147246882e197..15e740a628df1b1a2d8d363755cc170235b493de 100644 (file)
@@ -7,11 +7,14 @@ import java.io.InputStream;
 import java.io.PrintStream;
 
 import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.dom4j.Element;
 import org.wamblee.crawler.Action;
 import org.wamblee.crawler.Configuration;
 import org.wamblee.crawler.Crawler;
 import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
 
 /*
  * Copyright 2005 the original author or authors.
@@ -34,12 +37,14 @@ import org.wamblee.crawler.Page;
  */
 public class App {
 
+    private static final Log LOG = LogFactory.getLog(App.class);
+
     private static final String LOG_FILE = "crawler.log";
 
     public static void main(String[] args) throws Exception {
         String configFileName = args[0];
         String starturl = args[1];
+
         FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
         PrintStream os = new PrintStream(fos);
 
@@ -51,7 +56,7 @@ public class App {
 
             HttpClient client = new HttpClient();
             // client.getHostConfiguration().setProxy("localhost", 3128);
-            
+
             Crawler crawler = new CrawlerImpl(client, config);
 
             System.out.println("Retrieving: " + starturl);
@@ -79,27 +84,28 @@ public class App {
      */
     private static void showPage(Page aPage) {
         Action[] links = aPage.getActions();
-        for (Action link: links) { 
+        for (Action link : links) {
             System.out.println("Link found '" + link.getName() + "'");
         }
-        Element element = aPage.getContent(); 
+        Element element = aPage.getContent();
         System.out.println("Retrieved content: " + element.asXML());
     }
-    
-    private static void recordInterestingShows(Page page) { 
+
+    private static void recordInterestingShows(Page page) throws PageException {
         Action[] channels = page.getActions();
-        for (Action channel: channels) { 
-            examineChannel(channel.getName(), channel.execute().getAction("right-now").execute());
+        for (Action channel : channels) {
+            examineChannel(channel.getName(), channel.execute().getAction(
+                    "right-now").execute());
         }
     }
-    
-    private static void examineChannel(String aChannel, Page aPage) { 
-        Action[] programs = aPage.getActions(); 
-        for (Action program: programs) { 
+
+    private static void examineChannel(String aChannel, Page aPage) throws PageException {
+        Action[] programs = aPage.getActions();
+        for (Action program : programs) {
             System.out.println(aChannel + " - " + program.getName());
-            if ( program.getName().toLowerCase().matches(".*babe.*")) { 
+            if (program.getName().toLowerCase().matches(".*babe.*")) {
                 Page programPage = program.execute();
-                Action record = programPage.getAction("record"); 
+                Action record = programPage.getAction("record");
                 System.out.println("Recording possible: " + record != null);
             }
         }
index 89e815c8c943375165d9a9c83a7b015ae0667139..dafbc832b5f12583a96f119f6e86953496661c7d 100644 (file)
@@ -49,6 +49,9 @@ public class ConfigurationParser {
     private static final String METHOD_POST = "post";
     private static final String METHOD_GET = "get";
     
+    private static final int MAX_TRIES = 3; 
+    private static final int MAX_DELAY = 5000;
+    
     private PrintStream _os; 
     
     public ConfigurationParser(PrintStream aOs) {
@@ -122,10 +125,10 @@ public class ConfigurationParser {
         NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]);
         PageRequest request; 
         if ( METHOD_POST.equals(method)) { 
-            request = new PostPageRequest(paramsArray, xslt, _os);
+            request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os);
         }
         else if ( METHOD_GET.equals(method) || method == null ){
-            request = new GetPageRequest(paramsArray, xslt, _os);
+            request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os);
         } else { 
             throw new RuntimeException("Unknown request method '" + method + "'. Only " + 
                     METHOD_GET + " and " + METHOD_POST + " are supported");
index 8db31606fc476e419cde8a4055a5286ad8f2c324..53a3873ab43e7e936f90ddbacf78464a74f73ada 100644 (file)
@@ -25,6 +25,7 @@ import org.w3c.dom.Document;
 import org.wamblee.crawler.Configuration;
 import org.wamblee.crawler.Crawler;
 import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
 import org.wamblee.crawler.PageRequest;
 import org.wamblee.crawler.PageType;
 
@@ -34,9 +35,10 @@ import org.wamblee.crawler.PageType;
 public class CrawlerImpl implements Crawler {
     
     private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
+    private static final int MAX_DELAY = 5000; 
     
     private HttpClient _client; 
-    private Configuration _config; 
+    private Configuration _config;
     
     public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
         _client = aClient; 
@@ -47,7 +49,7 @@ public class CrawlerImpl implements Crawler {
      *  (non-Javadoc)
      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
      */
-    public Page getPage(String aUrl) {
+    public Page getPage(String aUrl) throws PageException {
         LOG.info("Getting page: url = '" + aUrl + "'");
         PageRequest request = _config.getRequest(aUrl);
         Document content = request.execute(aUrl, _client);
@@ -57,13 +59,13 @@ public class CrawlerImpl implements Crawler {
     /* (non-Javadoc)
      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String)
      */
-    public Page getPage(String aUrl, PageType aType) {
+    public Page getPage(String aUrl, PageType aType) throws PageException {
         LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
         PageRequest request = _config.getRequest(aType);
         Document content = request.execute(aUrl, _client);
         return transformToDom4jDoc(content); 
     }
-
+    
     /**
      * @param aUrl
      * @param request
index 7e625bf4cb82e45c110beb24910f186c0dac187d..a2b58b4899949163efeee50ac2ece223e3d63f63 100644 (file)
@@ -17,7 +17,7 @@
    &header;
        
        <target name="module.build.deps" 
-         depends="logging.d,commons-codec.d,dom4j.d,xerces.d,httpclient.d,jtidy.d,wamblee.support.d,wamblee.crawler.d">
+         depends="logging.d,mail.d,commons-codec.d,dom4j.d,xerces.d,httpclient.d,jtidy.d,wamblee.support.d,wamblee.crawler.d">
        </target>
        
        <!-- Set libraries to use in addition for test, a library which 
index dd9ba78d1d9a16e07ce956e256ce6922eb3e0a20..298e94435a495c3357aaff8c1a5f7e7f56930e19 100644 (file)
@@ -18,24 +18,36 @@ package org.wamblee.crawler.kiss;
 
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.util.ArrayList;
+import java.util.Date;
 import java.util.List;
+import java.util.Properties;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import javax.mail.Message;
+import javax.mail.MessagingException;
+import javax.mail.Session;
+import javax.mail.Transport;
+import javax.mail.internet.AddressException;
+import javax.mail.internet.InternetAddress;
+import javax.mail.internet.MimeMessage;
+
 import org.apache.commons.httpclient.HttpClient;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.dom4j.Element;
 import org.wamblee.conditions.Condition;
-import org.wamblee.conditions.OrCondition;
 import org.wamblee.crawler.Action;
 import org.wamblee.crawler.Configuration;
 import org.wamblee.crawler.Crawler;
 import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
 import org.wamblee.crawler.impl.ConfigurationParser;
 import org.wamblee.crawler.impl.CrawlerImpl;
 
@@ -43,7 +55,7 @@ import org.wamblee.crawler.impl.CrawlerImpl;
  * 
  */
 public class KissCrawler {
-    
+
     private static final Log LOG = LogFactory.getLog(KissCrawler.class);
 
     private static final String LOG_FILE = "kiss.log";
@@ -51,14 +63,16 @@ public class KissCrawler {
     private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
 
     private static final String CRAWLER_CONFIG = "config.xml";
-    
+
     private static final String PROGRAM_CONFIG = "programs.xml";
 
     private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
 
     private Pattern _pattern;
 
-    public KissCrawler(String aStartUrl, String aCrawlerConfig, String aProgramConfig) throws Exception {
+    public KissCrawler(String aStartUrl, String aCrawlerConfig,
+            String aProgramConfig) throws IOException, AddressException,
+            MessagingException {
 
         _pattern = Pattern.compile(TIME_REGEX);
 
@@ -66,33 +80,21 @@ public class KissCrawler {
         PrintStream os = new PrintStream(fos);
 
         try {
-            ConfigurationParser parser = new ConfigurationParser(os);
-            InputStream crawlerConfigFile = new FileInputStream(new File(aCrawlerConfig));
-            Configuration config = parser.parse(crawlerConfigFile);
-            
-            InputStream programConfigFile = new FileInputStream(new File(aProgramConfig)); 
-            Condition<Program> programCondition = new ProgramConfigurationParser().parse(programConfigFile); 
-           
-
             HttpClient client = new HttpClient();
             // client.getHostConfiguration().setProxy("localhost", 3128);
 
-            Crawler crawler = new CrawlerImpl(client, config);
+            Crawler crawler = createCrawler(aCrawlerConfig, os, client);
 
-            Page page = crawler.getPage(aStartUrl);
-            showPage(page);
-            page = page.getAction("channels-favorites").execute();
+            Page page = getStartPage(aStartUrl, crawler);
             TVGuide guide = createGuide(page);
             PrintVisitor printer = new PrintVisitor(System.out);
             guide.accept(printer);
-            
-            MatchVisitor matcher = new MatchVisitor(programCondition);
-            guide.accept(matcher);
-            List<Program> programs = matcher.getMatches(); 
-            for (Program program: programs) { 
-                System.out.println("Found: " + program + " record: " + program.record() );
-            }
-            
+
+            InputStream programConfigFile = new FileInputStream(new File(
+                    aProgramConfig));
+            Condition<Program> programCondition = new ProgramConfigurationParser()
+                    .parse(programConfigFile);
+            recordInterestingShows(programCondition, guide);
         } finally {
             os.flush();
             os.close();
@@ -100,6 +102,83 @@ public class KissCrawler {
         }
     }
 
+    /**
+     * Tries to record every program of the guide that matches the condition, then
+     * prints a summary of recorded, not recorded, and failed programs and mails it.
+     *
+     * @param programCondition Condition selecting the programs to record.
+     * @param guide TV guide to search.
+     * @throws AddressException If a mail address is malformed.
+     * @throws MessagingException If the summary mail cannot be sent.
+     */
+    private void recordInterestingShows(Condition<Program> programCondition,
+            TVGuide guide) throws AddressException, MessagingException {
+        MatchVisitor matcher = new MatchVisitor(programCondition);
+        guide.accept(matcher);
+        StringBuilder recorded = new StringBuilder();
+        StringBuilder notRecorded = new StringBuilder();
+        StringBuilder failures = new StringBuilder();
+        for (Program program : matcher.getMatches()) {
+            try {
+                StringBuilder target = program.record() ? recorded : notRecorded;
+                target.append("\n").append(program);
+            } catch (PageException e) {
+                // One failing program must not prevent the others from being recorded.
+                LOG.warn("Attempt to record " + program + " failed.", e);
+                failures.append("\n").append(program).append(": ").append(e.getMessage());
+            }
+        }
+        StringBuilder msg = new StringBuilder("Summary of KiSS crawler: \n\n\n");
+        if (recorded.length() > 0) {
+            msg.append("Recorded programs:\n\n").append(recorded).append("\n\n");
+        }
+        if (notRecorded.length() > 0) {
+            msg.append("Not recorded programs:\n\n").append(notRecorded).append("\n\n");
+        }
+        if (recorded.length() == 0 && notRecorded.length() == 0) {
+            // Ends with blank lines so a following "Failures:" section is not glued to it.
+            msg.append("No suitable programs found\n\n");
+        }
+        if (failures.length() > 0) {
+            msg.append("Failures:\n\n").append(failures);
+        }
+        String summary = msg.toString();
+        System.out.println(summary);
+        sendMail(summary);
+    }
+
+    /**
+     * Parses the crawler configuration file, closes it, and creates a crawler using the given client.
+     */
+    private Crawler createCrawler(String aCrawlerConfig, PrintStream os, HttpClient client) throws FileNotFoundException {
+        ConfigurationParser parser = new ConfigurationParser(os);
+        InputStream crawlerConfigFile = new FileInputStream(new File(aCrawlerConfig));
+        try {
+            return new CrawlerImpl(client, parser.parse(crawlerConfigFile)); // NOTE(review): assumes parse() reads the stream fully - confirm
+        } finally {
+            try {
+                crawlerConfigFile.close(); // the stream used to be left open
+            } catch (IOException e) {
+                LOG.warn("Could not close '" + aCrawlerConfig + "'", e);
+            }
+        }
+    }
+
+    /**
+     * Retrieves the start page and follows its "channels-favorites" action; failures surface as RuntimeException.
+     */
+    private Page getStartPage(String aStartUrl, Crawler crawler) {
+        try {
+            Action favorites = crawler.getPage(aStartUrl).getAction("channels-favorites");
+            if (favorites == null) { // getAction() may return null (Program.record() checks for it)
+                throw new RuntimeException("No 'channels-favorites' action on page " + aStartUrl);
+            }
+            return favorites.execute();
+        } catch (PageException e) {
+            throw new RuntimeException("Could not login to electronic program guide", e);
+        }
+    }
+
     public static void main(String[] args) throws Exception {
         new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
     }
@@ -118,9 +197,15 @@ public class KissCrawler {
         Action[] actions = page.getActions();
         List<Channel> channels = new ArrayList<Channel>();
         for (Action action : actions) {
-            Channel channel = createChannel(action.getName(), action.execute()
-                    .getAction("right-now").execute());
-            channels.add(channel);
+            try {
+                LOG.info("Getting channel info for '" + action.getName() + "'");
+                Channel channel = createChannel(action.getName(), action
+                        .execute().getAction("right-now").execute());
+                channels.add(channel);
+            } catch (PageException e) {
+                LOG.error("Could not create channel information for '"
+                        + action.getName() + "'", e);
+            }
         }
         return new TVGuide(channels);
     }
@@ -133,22 +218,44 @@ public class KissCrawler {
             String time = action.getContent().element("time").getText().trim();
             Matcher matcher = _pattern.matcher(time);
             if (matcher.matches()) {
-                Time begin = new Time(Integer.parseInt(matcher.group(1)), 
-                                      Integer.parseInt(matcher.group(2)));
-                Time end = new Time(Integer.parseInt(matcher.group(3)), 
-                        Integer.parseInt(matcher.group(4)));
+                Time begin = new Time(Integer.parseInt(matcher.group(1)),
+                        Integer.parseInt(matcher.group(2)));
+                Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
+                        .parseInt(matcher.group(4)));
                 TimeInterval interval = new TimeInterval(begin, end);
-                //Page programInfo = action.execute();
-                //String description = programInfo.getContent().element("description").getText().trim();
-                //String keywords = programInfo.getContent().element("keywords").getText().trim();
+                // Page programInfo = action.execute();
+                // String description =
+                // programInfo.getContent().element("description").getText().trim();
+                // String keywords =
+                // programInfo.getContent().element("keywords").getText().trim();
                 String description = "";
                 String keywords = "";
-                Program program = new Program(aChannel, action.getName(), description, keywords, interval, action);
-                
+                Program program = new Program(aChannel, action.getName(),
+                        description, keywords, interval, action);
+
                 LOG.debug("Got program " + program);
                 programs.add(program);
             }
         }
         return new Channel(aChannel, programs);
     }
+
+    /** Mails the given text with subject "KiSS crawler update" via SMTP host "falcon", port 25. */
+    private void sendMail(String aText) throws AddressException, MessagingException {
+        Properties smtpSettings = new Properties();
+        smtpSettings.put("mail.transport.protocol", "smtp");
+        smtpSettings.put("mail.smtp.host", "falcon");
+        smtpSettings.put("mail.smtp.port", "25");
+        Message mail = new MimeMessage(Session.getInstance(smtpSettings));
+
+        // Sender and recipient are the same mailbox.
+        InternetAddress sender = new InternetAddress("erik@brakkee.org");
+        InternetAddress recipient = new InternetAddress("erik@brakkee.org");
+        mail.setFrom(sender);
+        mail.setRecipient(Message.RecipientType.TO, recipient);
+        mail.setSentDate(new Date());
+        mail.setSubject("KiSS crawler update");
+        mail.setText(aText);
+        Transport.send(mail);
+    }
 }
index 765ddfe1b2b8b590dacd1e168194d0adb20cab4f..2308fa982296ea76368d782043c2425d11af0486 100644 (file)
@@ -17,6 +17,7 @@
 package org.wamblee.crawler.kiss;
 
 import org.wamblee.crawler.Action;
+import org.wamblee.crawler.PageException;
 
 /**
  * 
@@ -62,7 +63,7 @@ public class Program {
         return _interval; 
     }
     
-    public boolean record() { 
+    public boolean record() throws PageException {
         Action record = _programInfo.execute().getAction(RECORD_ACTION); 
         if ( record == null) { 
             return false;