(no commit message)
author erik <erik@77661180-640e-0410-b3a8-9f9b13e6d0e0>
Tue, 25 Apr 2006 19:21:16 +0000 (19:21 +0000)
committer erik <erik@77661180-640e-0410-b3a8-9f9b13e6d0e0>
Tue, 25 Apr 2006 19:21:16 +0000 (19:21 +0000)
12 files changed:
trunk/.classpath
trunk/build/header.xml
trunk/crawler/basic/src/log4j.properties
trunk/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java
trunk/crawler/kissweb/WebRoot/WEB-INF/web.xml
trunk/crawler/kissweb/build.xml
trunk/crawler/kissweb/src/org.wamblee.beanfactory.properties
trunk/crawler/kissweb/src/org.wamblee.crawler.kiss.xml
trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java
trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java
trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java
trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java

index 67c79631196ff642c5140a808ed3b81d3443a4b8..14a8215642a43b22a8371f12df3117566d05c637 100644 (file)
@@ -61,5 +61,7 @@
        <classpathentry kind="lib" path="support/lib/external/spring-1.2.5.jar"/>
        <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/spring-1.2.5.jar"/>
        <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/quartz-1.5.1.jar"/>
+       <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/jstl-1.1.2.jar"/>
+       <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/standard-1.1.2.jar"/>
        <classpathentry kind="output" path="crawler/kissweb/WebRoot/WEB-INF/classes"/>
 </classpath>
index 29b6aa95ed2794b1a277f5389a716b8c809aa8fd..537f84f4719aa09518966bac4843ff6ff7782c43 100644 (file)
   </antcall>
 </target>
 
+<!-- Invokes download.dep for the two JSTL jars: jstl 1.1.2 and taglibs/standard 1.1.2. -->
+<target name="jstl.d">
+  <antcall target="download.dep">
+    <param name="group" value="jstl"/>
+    <param name="version" value="1.1.2"/>
+  </antcall>
+  <antcall target="download.dep">
+    <param name="group" value="taglibs"/>
+    <param name="artifact" value="standard"/>
+    <param name="version" value="1.1.2"/>
+  </antcall>
+</target>
+
+
+
 <target name="quartz.d">
   <antcall target="download.dep">
     <param name="group" value="quartz"/>
index 652561490df3476ebabe327405f6966f07fc3d66..ab710b36ed32ca084c93ba8c64e3829a9ba092ac 100644 (file)
@@ -10,7 +10,7 @@
 log4j.rootLogger=ERROR, console
 
 # Log level for wamblee.org
-log4j.logger.org.wamblee=INFO
+log4j.logger.org.wamblee=DEBUG
 log4j.logger.org.wamblee.usermgt.UserAdministrationImplTest=INFO
 log4j.logger.org.wamblee.security.authorization=ERROR
 log4j.logger.org.wamblee.cache=INFO
index a9a8097a7ae038a92bda4ded8d158592bedcdc6a..969c5b23749e8e500f052308af0ed48a44f09308 100644 (file)
@@ -65,7 +65,7 @@ public class KissCrawler {
     /**
      * Default socket timeout to use. 
      */
-    private static final int SOCKET_TIMEOUT = 20000; 
+    private static final int SOCKET_TIMEOUT = 10000; 
 
     /**
      * Regular expression for matching time interval strings in the retrieved
@@ -107,7 +107,7 @@ public class KissCrawler {
      *             In case of problems sending a mail notification.
      */
     public KissCrawler(String aCrawlerConfig,
-            String aProgramConfig) throws IOException, NotificationException {
+            String aProgramConfig) throws IOException, NotificationException, PageException {
         this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig);
     }
 
@@ -130,7 +130,7 @@ public class KissCrawler {
      *             In case of problems sending a mail notification.
      */
     public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig,
-            String aProgramConfig) throws IOException, NotificationException {
+            String aProgramConfig) throws IOException, NotificationException, PageException {
 
         _pattern = Pattern.compile(TIME_REGEX);
 
@@ -162,6 +162,7 @@ public class KissCrawler {
             } catch (PageException e) {
                 report.addMessage("Problem getting TV guide", e);
                 LOG.info("Problem getting TV guide", e);
+                throw e; 
             }
             parser.getNotifier().send(report.asXml());
         } finally {
index 3239a34a073439cc0d8cb6b0bc6415bfa3e81efb..2ff8779fecd1540545696253e208c65b946cd3cb 100644 (file)
@@ -8,4 +8,14 @@
     <listener>
         <listener-class>org.wamblee.crawler.kiss.servlet.Application</listener-class>
     </listener>
+    
+    <servlet>
+        <servlet-name>CrawlerServlet</servlet-name>
+        <servlet-class>org.wamblee.crawler.kiss.servlet.CrawlerServlet</servlet-class>
+    </servlet>
+    
+    <servlet-mapping>
+        <servlet-name>CrawlerServlet</servlet-name>
+        <url-pattern>/</url-pattern>
+    </servlet-mapping>
 </web-app>
index aa400c7bf9c71b9a7aaf6d113eff7f4d0e689b30..d1987984c5f87aa73fe63be13953164ee6f10ad5 100644 (file)
@@ -22,7 +22,7 @@
    &kisscrawlerdeps;
        
        <target name="module.build.deps" 
-         depends="kisscrawler.src.d,servletapi.d,wamblee.kisscrawler.d,quartz.d,spring.d">
+         depends="kisscrawler.src.d,servletapi.d,wamblee.kisscrawler.d,quartz.d,spring.d,jstl.d">
        </target>
        
        <!-- Set libraries to use in addition for test, a library which 
index 563a09c9d1fdbae48fcff3318b735487f8f64541..73239e65fc6a5bc813bc7b055455ae4771796fdc 100644 (file)
@@ -1,6 +1,6 @@
 
 ##############################################################################
-# Class name of the beanfactory used by the photos application
+# Class name of the beanfactory used by the crawler application
 ##############################################################################
 
 org.wamblee.beanfactory.class=org.wamblee.crawler.kiss.spring.CrawlerBeanFactory
index 0137d940ae4ad577718a80d20ec9d75bd50a51af..01e88591439110d6a4486c1194fcde68775c03a7 100644 (file)
@@ -4,7 +4,7 @@
 <beans>
  
   <!-- The object that tells quartz how to schedule the crawler --> 
-  <bean id="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler"
+  <bean id="org.wamblee.crawler.kiss.scheduling.CrawlerScheduler"
       class="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler">
       <constructor-arg><value type="int">3600</value></constructor-arg>
   </bean>   
   <bean id="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule"
       class="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule">
       <constructor-arg><ref local="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"/></constructor-arg>
-      <constructor-arg><value type="int">5</value></constructor-arg> <!-- from 5 AM --> 
-      <constructor-arg><value type="int">16</value></constructor-arg> <!-- to 4 PM --> 
+      <!-- The interval of the day in hours [hourmin, hourmax] over which crawling will be done and 
+           retried if necessary --> 
+      <constructor-arg><value type="int">5</value></constructor-arg>  
+      <constructor-arg><value type="int">24</value></constructor-arg>  
   </bean>
   
 </beans> 
\ No newline at end of file
index e07d90a68ab74cf40384729cff58903d99da8bec..428ba983cd059ff266e1a12d48fe13b70892a0bf 100644 (file)
@@ -25,5 +25,11 @@ import java.util.Date;
  *
  */
 public interface CrawlerExecutor { 
+    
+    /**
+     * Executes the crawler. 
+     * @param aDate Date at which the crawler is being triggered. 
+     * @throws Exception If the crawler run fails; implementations may throw any exception type. 
+     */
     void execute(Date aDate) throws Exception;
 }
index e341169a82e76e5cad7da3f6cacda4b7cc84097b..5121f92058df76a437ca159e595c1b0e1f089866 100644 (file)
@@ -43,14 +43,20 @@ public class CrawlerSchedule implements Serializable {
     
     /**
      * Constructs the scheduler.
-     * @param aCrawler The interface by which the crawler is executed.
+     * The crawler will run if it is triggered in the range between the minimum (included)
+     * and maximum (included) hour of the day if either
+     * <ul>
+     *   <li>it is triggered for the first time on the current day.</li>
+     *   <li>an earlier crawling attempt on the same day failed. </li>
+     * </ul>
+     * @param aCrawler The interface through which the crawler is executed.
      * @param aHourMin The crawler may only run if hour &gt;= <code>aHourMin</code>
      * @param aHourMax The crawler may only run if hour &lt;= <code>aHourMax</code>
      */
     public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) { 
         _crawler = aCrawler; 
-        _lastExecuted = null
-        _lastResult = false;
+        _lastExecuted = new Date()
+        _lastResult = true; // the crawler will automatically run the next day. 
         _lastException = null; 
         _hourMin = aHourMin;
         _hourMax = aHourMax;
@@ -62,6 +68,7 @@ public class CrawlerSchedule implements Serializable {
      * @param aDate Time at which we are executing now. 
      */
     public void execute(Date aDate) { 
+        
         if (mustExecute(aDate)) { 
             LOG.info("Executing crawler at " + aDate);
             try {  
@@ -85,6 +92,10 @@ public class CrawlerSchedule implements Serializable {
         return _lastExecuted; 
     }
     
+    /**
+     * Overrides the recorded time of the last crawler execution. 
+     * @param aDate New last execution time. While this is <code>null</code>, 
+     *     <code>mustExecute</code> returns false, so scheduled triggers do not run the crawler. 
+     */
+    public void setLastExecuted(Date aDate) { 
+        _lastExecuted = aDate;
+    }
+    
     /**
      * Gets the result of the last execution. 
      * @return True iff last execution was a success. 
@@ -108,11 +119,14 @@ public class CrawlerSchedule implements Serializable {
      * @return True iff the crawler must be run. 
      */
     private boolean mustExecute(Date aDate) {
+        if ( _lastExecuted == null ) {
+            return false; // crawler must be started manually at least once after deployment.
+        }
         Calendar calendar = Calendar.getInstance();
         calendar.setTime(aDate);
         int hour = calendar.get(Calendar.HOUR_OF_DAY);
         if ( hour < _hourMin ) {
-            return false;
+            return false; 
         }
         if (hour > _hourMax ) { 
             return false; 
@@ -128,17 +142,24 @@ public class CrawlerSchedule implements Serializable {
         return false; // already run successfully today.
     }
     
+    /**
+     * Determines if the last execution was on the same day as the given time. 
+     * @param aDate Current time. 
+     * @return True iff a last execution is recorded and it has the same day of the year as 
+     *     <code>aDate</code>; false if no last execution is recorded. 
+     */
     private boolean lastExecutionWasOnSameDay(Date aDate) {
         if ( _lastExecuted == null ) { 
             return false;
         }
         int curDay = getDayOfYear(aDate);
         int lastDay = getDayOfYear(_lastExecuted);
-        return curDay == lastDay; 
+        return curDay == lastDay; // only day-of-year values are compared, not the year, so this is
+                                  // wrong only if the last execution was on this day of another year.
     }
 
     /**
-     * @param aDate
+     * Gets the day of the year
+     * @param aDate Date to compute day for. 
      */
     private int getDayOfYear(Date aDate) {
         Calendar calendar = Calendar.getInstance();
index 94201f37b23213d08c3b47b9a03379da456ae46a..5fa7cf86c4714a4f1acc158b643a1b3d3f305859 100644 (file)
 package org.wamblee.crawler.kiss.scheduling.quartz;
 
 import java.util.Date;
+import java.util.List;
 
 import org.quartz.JobDetail;
 import org.quartz.Scheduler;
 import org.quartz.SchedulerException;
 import org.quartz.SchedulerFactory;
+import org.quartz.SimpleTrigger;
 import org.quartz.Trigger;
 import org.quartz.TriggerUtils;
 import org.quartz.impl.StdSchedulerFactory;
+import org.wamblee.crawler.kiss.scheduling.CrawlerScheduler;
 
 /**
  * Interface to the Quartz scheduler.
  */
-public class QuartzCrawlerScheduler {
+public class QuartzCrawlerScheduler implements CrawlerScheduler {
     
+    /**
+     * Name of the interval trigger that periodically fires the crawler job.
+     */
+    private static final String TRIGGER_NAME = "interval";
+
+    /**
+     * Name of the crawler job in Quartz; triggers reference the job by this name.
+     */
+    private static final String JOB_NAME = "kisscrawler";
+
     private Scheduler _scheduler; 
     
     private int _intervalInSeconds;
@@ -53,15 +66,32 @@ public class QuartzCrawlerScheduler {
     public void initialize() throws SchedulerException { 
         _scheduler.start();
 
-        JobDetail jobDetail = new JobDetail("kisscrawler", null, CrawlerJob.class);
+        JobDetail jobDetail = new JobDetail(JOB_NAME, null, CrawlerJob.class);
         Trigger trigger = TriggerUtils.makeSecondlyTrigger(_intervalInSeconds);
         //trigger.setStartTime(TriggerUtils.getEvenHourDate(new Date()));
         trigger.setStartTime(new Date());
-        trigger.setName("hourly");
+        trigger.setName(TRIGGER_NAME);
 
         _scheduler.scheduleJob(jobDetail, trigger);
     }
     
+    /* (non-Javadoc)
+     * Reports the crawler as running when the scheduler lists at least one
+     * currently executing job (any job, not specifically the crawler job).
+     * @see org.wamblee.crawler.kiss.scheduling.CrawlerScheduler#isCrawlerRunning()
+     */
+    public boolean isCrawlerRunning() throws Exception {
+        final List executingJobs = _scheduler.getCurrentlyExecutingJobs();
+        final boolean anyExecuting = executingJobs.size() > 0;
+        return anyExecuting;
+    }
+    
+    /* (non-Javadoc)
+     * Registers an additional trigger named "immediate" for the crawler job.
+     * NOTE(review): the trigger name is fixed; presumably Quartz rejects a second
+     * call while an "immediate" trigger is still registered - confirm.
+     * @see org.wamblee.crawler.kiss.scheduling.CrawlerScheduler#scheduleNow()
+     */
+    public void scheduleNow() throws Exception {
+        Trigger immediateTrigger = new SimpleTrigger("immediate", null);
+        immediateTrigger.setJobName(JOB_NAME);
+        _scheduler.scheduleJob(immediateTrigger);
+    }
+    
     /**
      * Shuts down the scheduler. 
      * @throws SchedulerException
index d8be2bf7779e395f7b0a256ad8ba5e2200955223..19d3b6faf4f4f743684316e16925d81641c14438 100644 (file)
 
 package org.wamblee.crawler.kiss.servlet;
 
-import java.util.Date;
-
 import javax.servlet.ServletContextEvent;
 import javax.servlet.ServletContextListener;
 
-import org.quartz.JobDetail;
-import org.quartz.Scheduler;
 import org.quartz.SchedulerException;
-import org.quartz.SchedulerFactory;
-import org.quartz.Trigger;
-import org.quartz.TriggerUtils;
-import org.quartz.core.QuartzScheduler;
-import org.quartz.impl.StdSchedulerFactory;
-import org.wamblee.crawler.kiss.scheduling.quartz.CrawlerJob;
-import org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler;
+import org.wamblee.crawler.kiss.scheduling.CrawlerScheduler;
 import org.wamblee.general.BeanKernel;
 
 /**
@@ -55,8 +45,9 @@ public class Application implements ServletContextListener {
         aEvent.getServletContext().log("KiSS Crawler initializing");
         try {
             getScheduler().initialize();
-        } catch (SchedulerException e) {
+        } catch (Exception e) {
             aEvent.getServletContext().log("Error scheduling job", e);
+            return; 
         }
         aEvent.getServletContext().log("KiSS Crawler initialized");
     }
@@ -70,8 +61,9 @@ public class Application implements ServletContextListener {
         aEvent.getServletContext().log("KiSS Crawler shutting down");
         try {
             getScheduler().shutdown();
-        } catch (SchedulerException e) {
+        } catch (Exception e) {
             aEvent.getServletContext().log("Error scheduling job", e);
+            return; 
         }
         aEvent.getServletContext().log("KiSS Crawler shut down complete");
     }
@@ -80,11 +72,11 @@ public class Application implements ServletContextListener {
      * Gets the scheduler from Spring. 
      * @return Scheduler. 
      */
-    private QuartzCrawlerScheduler getScheduler() { 
-        return BeanKernel.getBeanFactory().find(QuartzCrawlerScheduler.class);
+    private CrawlerScheduler getScheduler() { 
+        return BeanKernel.getBeanFactory().find(CrawlerScheduler.class);
     }
 
-    public static void main(String[] aArgs) throws SchedulerException {
+    public static void main(String[] aArgs) throws Exception {
         Application application = new Application();
         application.getScheduler().initialize();
     }