Now crawling in desktop mode should work. It is much more efficient
author erik <erik@77661180-640e-0410-b3a8-9f9b13e6d0e0>
Wed, 23 Aug 2006 22:13:15 +0000 (22:13 +0000)
committer erik <erik@77661180-640e-0410-b3a8-9f9b13e6d0e0>
Wed, 23 Aug 2006 22:13:15 +0000 (22:13 +0000)
since it does not need to click on each and every program to obtain
program information.

The crawler now examines the next day's programme guide instead of the
current day's, and it is scheduled to run between 19:00 and 24:00.

trunk/.classpath
trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java
trunk/crawler/kiss/src/channel-right-now-graphic.xsl
trunk/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java
trunk/crawler/kissweb/src/org.wamblee.crawler.kiss.xml

index af50c67ab76f07db1873d3edff2c51f18af74798..918f2c28c4f6fc96f2005afbeafa99c411320fad 100644 (file)
@@ -50,7 +50,7 @@
        <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/commons-email-1.0.jar"/>
        <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/commons-httpclient-3.0.jar"/>
        <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/commons-logging-1.0.2.jar"/>
-       <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/dom4j-1.6.jar"/>
+       <classpathentry sourcepath="/usr/java/dom4j/src/java" kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/dom4j-1.6.jar"/>
        <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/jaxen-1.1-beta-4.jar"/>
        <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/jstl-1.1.2.jar"/>
        <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/jtidy-4aug2000r7-dev.jar"/>
index da3fd2be6bca5511e4eecee6bde3bec562b8e77e..e88f4007a1dbe8d5b0c38e6b93ae816a482fbfb8 100644 (file)
@@ -61,7 +61,7 @@ public class ConfigurationParser {
 
     private static final int MAX_TRIES = 3;
 
-    private static final int MAX_DELAY = 5000;
+    private static final int MAX_DELAY = 10000;
     
     private XslTransformer _transformer;
 
index 43356172bacef0170467a717d32f4645b415baa1..6b5070c43425a4a2d4b2cba6a0a3aceb7db61023 100644 (file)
@@ -1,13 +1,12 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-  xmlns:xhtml="http://www.w3.org/1999/xhtml"
-  version="1.0">
-  
+  xmlns:xhtml="http://www.w3.org/1999/xhtml" version="1.0">
+
   <xsl:output method="xml"/>
   <xsl:strip-space elements="xhtml:a"/>
-  
+
   <xsl:include href="utilities.xsl"/>
-  
+
   <!-- =====================================================
     Copying template.
     ===================================================== -->
       <xsl:apply-templates select="@*|node()" mode="copy"/>
     </xsl:copy>
   </xsl:template>
-  
+
   <xsl:template match="/">
     <xsl:element name="channel-right-now">
-      <xsl:apply-templates select="//xhtml:table[3]//xhtml:tr[xhtml:td[not(contains(@class, 'listCell'))]]"/>
+      <xsl:apply-templates
+        select="//xhtml:table[3]//xhtml:tr[xhtml:td[not(contains(@class, 'listCell'))]]"/>
     </xsl:element>
   </xsl:template>
-  
+
   <xsl:template match="xhtml:tr">
-     <xsl:element name="action">
-       <xsl:attribute name="name">
-         <xsl:call-template name="string-replace">
-           <xsl:with-param name="src" select="xhtml:td[3]//xhtml:a"/>
-           <xsl:with-param name="from" select="$newline"/>
-           <xsl:with-param name="to" select="''"/>
-         </xsl:call-template>
-       </xsl:attribute>
-       <xsl:attribute name="type">
-         <xsl:text>program-info</xsl:text>
-       </xsl:attribute>
-       <xsl:attribute name="reference">
-         <xsl:value-of select="xhtml:td[3]//xhtml:a/@href"/>
-       </xsl:attribute>
-       <xsl:element name="time">
-         <xsl:call-template name="string-replace">
-           <xsl:with-param name="src" select="xhtml:td[1]"/>
-           <xsl:with-param name="from" select="$newline"/>
-           <xsl:with-param name="to" select="''"/>
-         </xsl:call-template>
-       </xsl:element>
-       <xsl:apply-templates select=".//xhtml:script"/>
-     </xsl:element>
+    <xsl:element name="action">
+      <xsl:attribute name="name">
+        <xsl:call-template name="string-replace">
+          <xsl:with-param name="src" select="xhtml:td[3]//xhtml:a"/>
+          <xsl:with-param name="from" select="$newline"/>
+          <xsl:with-param name="to" select="''"/>
+        </xsl:call-template>
+      </xsl:attribute>
+      <xsl:attribute name="type">
+        <xsl:text>program-info</xsl:text>
+      </xsl:attribute>
+      <xsl:attribute name="reference">
+        <xsl:value-of select="xhtml:td[3]//xhtml:a/@href"/>
+      </xsl:attribute>
+      <xsl:element name="time">
+        <xsl:call-template name="string-replace">
+          <xsl:with-param name="src" select="xhtml:td[1]"/>
+          <xsl:with-param name="from" select="$newline"/>
+          <xsl:with-param name="to" select="''"/>
+        </xsl:call-template>
+      </xsl:element>
+      <xsl:apply-templates select=".//xhtml:script"/>
+    </xsl:element>
     <xsl:text>
       
     </xsl:text>
   </xsl:template>
-  
+
   <xsl:template match="xhtml:script">
     <xsl:variable name="script">
-      <xsl:value-of select="."/>   
+      <xsl:value-of select="."/>
     </xsl:variable>
     <xsl:variable name="description">
-      <xsl:value-of select="substring-before(substring-after($script, '&lt;br&gt;'), '&quot;]')"/> 
-    </xsl:variable>>
-    <xsl:if test="string-length($description) > 0 ">
-      <description>
-        <xsl:value-of select="$description"/>
-      </description>
-    </xsl:if>
-     
+      <xsl:value-of
+        select="substring-before(substring-after($script, '&lt;br&gt;'), '&quot;]')"/>
+    </xsl:variable>
+    <description>
+      <xsl:value-of select="$description"/>
+    </description>
+
   </xsl:template>
-  
+
 </xsl:stylesheet>
index 3191ff24e627cffdd5d0eab3321e106c090a3ba9..3300e1299e71b6c4268d9ba9447b65d6df17fe8f 100644 (file)
@@ -32,6 +32,7 @@ import org.apache.commons.httpclient.HttpClient;
 import org.apache.commons.httpclient.NameValuePair;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.dom4j.Element;
 import org.wamblee.crawler.Action;
 import org.wamblee.crawler.Configuration;
 import org.wamblee.crawler.Crawler;
@@ -63,11 +64,11 @@ public class KissCrawler {
      * Start URL of the electronic programme guide.
      */
     private static final String START_URL = "http://epg.kml.kiss-technology.com/login.php";
-    
+
     /**
-     * Default socket timeout to use. 
+     * Default socket timeout to use.
      */
-    private static final int SOCKET_TIMEOUT = 10000; 
+    private static final int SOCKET_TIMEOUT = 10000;
 
     /**
      * Regular expression for matching time interval strings in the retrieved
@@ -89,14 +90,15 @@ public class KissCrawler {
      *             In case of problems.
      */
     public static void main(String[] aArgs) throws Exception {
-        String crawlerConfig = new File(aArgs[0]).getCanonicalPath(); 
-        String programConfig = new File(aArgs[1]).getCanonicalPath(); 
+        String crawlerConfig = new File(aArgs[0]).getCanonicalPath();
+        String programConfig = new File(aArgs[1]).getCanonicalPath();
 
         BeanFactory factory = new StandaloneCrawlerBeanFactory();
         Notifier notifier = factory.find(Notifier.class);
-        new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig, notifier, new Report());
+        new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig,
+                programConfig, notifier, new Report());
     }
-    
+
     /**
      * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
      * EPG guide, filters the guide for interesting programs, tries to record
@@ -106,19 +108,24 @@ public class KissCrawler {
      *            Configuration file for the crawler.
      * @param aProgramConfig
      *            Configuration file describing interesting shows.
-     * @param aNotifier Object used to send notifications of the results.           
-     * @param aReport Report to use. 
+     * @param aNotifier
+     *            Object used to send notifications of the results.
+     * @param aReport
+     *            Report to use.
      * @throws IOException
      *             In case of problems reading files.
-     * @throws NotificationException In case notification fails.
-     * @throws PageException In case of problems retrieving the TV guide. 
+     * @throws NotificationException
+     *             In case notification fails.
+     * @throws PageException
+     *             In case of problems retrieving the TV guide.
      */
-    public KissCrawler(String aCrawlerConfig,
-            String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException {
-        this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig, aNotifier, aReport);
+    public KissCrawler(String aCrawlerConfig, String aProgramConfig,
+            Notifier aNotifier, Report aReport) throws IOException,
+            NotificationException, PageException {
+        this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig,
+                aNotifier, aReport);
     }
 
-
     /**
      * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
      * EPG guide, filters the guide for interesting programs, tries to record
@@ -126,27 +133,35 @@ public class KissCrawler {
      * 
      * @param aStartUrl
      *            Start URL of the electronic programme guide.
-     * @param aSocketTimeout Socket timeout to use. 
+     * @param aSocketTimeout
+     *            Socket timeout to use.
      * @param aCrawlerConfig
      *            Configuration file for the crawler.
      * @param aProgramConfig
      *            Configuration file describing interesting shows.
-     * @param aNotifier Object used to send notifications of the results.           
-     * @param aReport Report to use. 
+     * @param aNotifier
+     *            Object used to send notifications of the results.
+     * @param aReport
+     *            Report to use.
      * @throws IOException
      *             In case of problems reading files.
-     * @throws NotificationException In case notification fails.
-     * @throws PageException In case of problems retrieving the TV guide.   
+     * @throws NotificationException
+     *             In case notification fails.
+     * @throws PageException
+     *             In case of problems retrieving the TV guide.
      */
-    public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig,
-            String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException {
+    public KissCrawler(String aStartUrl, int aSocketTimeout,
+            String aCrawlerConfig, String aProgramConfig, Notifier aNotifier,
+            Report aReport) throws IOException, NotificationException,
+            PageException {
 
         _pattern = Pattern.compile(TIME_REGEX);
 
         try {
             HttpClient client = new HttpClient();
             // client.getHostConfiguration().setProxy("127.0.0.1", 3128);
-            client.getParams().setParameter("http.socket.timeout", SOCKET_TIMEOUT);
+            client.getParams().setParameter("http.socket.timeout",
+                    SOCKET_TIMEOUT);
 
             XslTransformer transformer = new XslTransformer(
                     new ClasspathUriResolver());
@@ -163,12 +178,11 @@ public class KissCrawler {
                 TVGuide guide = createGuide(page, aReport);
                 PrintVisitor printer = new PrintVisitor(System.out);
                 guide.accept(printer);
-                processResults(programFilters, guide, aNotifier,
-                        aReport);
+                processResults(programFilters, guide, aNotifier, aReport);
             } catch (PageException e) {
                 aReport.addMessage("Problem getting TV guide", e);
                 LOG.info("Problem getting TV guide", e);
-                throw e; 
+                throw e;
             }
             aNotifier.send(aReport.asXml());
         } finally {
@@ -260,12 +274,14 @@ public class KissCrawler {
      * @param aReport
      *            Report to use.
      * @return TV guide.
-     * @throws PageException In case of problem getting the tv guide.
+     * @throws PageException
+     *             In case of problem getting the tv guide.
      */
-    private TVGuide createGuide(Page aPage, Report aReport) throws PageException {
+    private TVGuide createGuide(Page aPage, Report aReport)
+            throws PageException {
         LOG.info("Obtaining full TV guide");
         Action[] actions = aPage.getActions();
-        if ( actions.length == 0 ) { 
+        if (actions.length == 0) {
             LOG.error("No channels found");
             throw new PageException("No channels found");
         }
@@ -273,13 +289,13 @@ public class KissCrawler {
         for (Action action : actions) {
             try {
                 LOG.info("Getting channel info for '" + action.getName() + "'");
-                Action rightNow = action.execute().getAction("right-now");
-                if (rightNow == null) {
+                Action tomorrow = action.execute().getAction("tomorrow");
+                if (tomorrow == null) {
                     throw new PageException("Channel summary page for '"
                             + action.getName()
                             + "' does not contain required information");
                 }
-                Channel channel = createChannel(action.getName(), rightNow
+                Channel channel = createChannel(action.getName(), tomorrow
                         .execute(), aReport);
                 channels.add(channel);
                 if (SystemProperties.isDebugMode()) {
@@ -319,18 +335,25 @@ public class KissCrawler {
                 TimeInterval interval = new TimeInterval(begin, end);
                 String description = "";
                 String keywords = "";
+
                 if (!SystemProperties.isNoProgramDetailsRequired()) {
-                    try {
-                        Page programInfo = action.execute();
-                        description = programInfo.getContent().element(
-                                "description").getText().trim();
-                        keywords = programInfo.getContent().element("keywords")
-                                .getText().trim();
-                    } catch (PageException e) {
-                        String msg =   "Program details could not be determined for '"
-                            + action.getName() + "'";
-                        aReport.addMessage(msg, e);
-                        LOG.warn(msg, e);
+                    Element descriptionElem = action.getContent().element(
+                            "description");
+                    if (descriptionElem == null) {
+                        try {
+                            Page programInfo = action.execute();
+                            description = programInfo.getContent().element(
+                                    "description").getText().trim();
+                            keywords = programInfo.getContent().element(
+                                    "keywords").getText().trim();
+                        } catch (PageException e) {
+                            String msg = "Program details could not be determined for '"
+                                    + action.getName() + "'";
+                            aReport.addMessage(msg, e);
+                            LOG.warn(msg, e);
+                        }
+                    } else {
+                        description = descriptionElem.getTextTrim();
                     }
                 }
                 Program program = new Program(aChannel, action.getName(),
index 6f0e2c96202e4f85f2db11a2efcb9af3d6527a0a..022b4aa0a92ceb82aff67b9fad38a80118940d9a 100644 (file)
@@ -24,7 +24,7 @@
       <constructor-arg><ref local="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"/></constructor-arg>
       <!-- The interval of the day in hours [hourmin, hourmax] over which crawling will be done and 
            retried if necessary --> 
-      <constructor-arg><value type="int">5</value></constructor-arg>  
+      <constructor-arg><value type="int">19</value></constructor-arg>  
       <constructor-arg><value type="int">24</value></constructor-arg>  
   </bean>