(no commit message)

[utils] / crawler / kissweb / src / org / wamblee / crawler / kiss / scheduling / CrawlerSchedule.java
diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java

index 09ada84da8d4f9bf0411ef1b6cb228a252ce4f27..5121f92058df76a437ca159e595c1b0e1f089866 100644 (file)
--- a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java
+++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java
@@ -20,6 +20,9 @@ import java.io.Serializable;
  import java.util.Calendar;
  import java.util.Date;
  
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
  /**
   * This class encapsulates the logic for deciding whether to
   * run the crawler. This provides the mechanism to keep the 
@@ -29,22 +32,32 @@ import java.util.Date;
   */
  public class CrawlerSchedule implements Serializable {
      
+    private static final Log LOG = LogFactory.getLog(CrawlerSchedule.class);
+    
      private CrawlerExecutor _crawler; 
      private Date _lastExecuted;
-    private Exception _lastResult;
+    private boolean _lastResult;
+    private Exception _lastException;
      private int _hourMin; 
      private int _hourMax;
      
      /**
       * Constructs the scheduler.
-     * @param aCrawler The interface by which the crawler is executed.
+     * The crawler runs when it is triggered between the minimum and maximum hour of the
+     * day (both inclusive) and either
+     * <ul>
+     *   <li>it is triggered for the first time on the current day.</li>
+     *   <li>an earlier crawling attempt on the same day failed. </li>
+     * </ul>
+     * @param aCrawler The interface through which the crawler is executed.
       * @param aHourMin The crawler may only run if hour &gt;= <code>aHourMin</code>
       * @param aHourMax The crawler may only run if hour &lt;= <code>aHourMax</code>
       */
      public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) { 
          _crawler = aCrawler; 
-        _lastExecuted = null; 
-        _lastResult = null; 
+        _lastExecuted = new Date(); 
+        _lastResult = true; // the crawler will automatically run the next day. 
+        _lastException = null; 
          _hourMin = aHourMin;
          _hourMax = aHourMax;
      }
@@ -55,16 +68,20 @@ public class CrawlerSchedule implements Serializable {
       * @param aDate Time at which we are executing now. 
       */
      public void execute(Date aDate) { 
+        
          if (mustExecute(aDate)) { 
-            try { 
-                _lastResult = null; 
+            LOG.info("Executing crawler at " + aDate);
+            try {  
                  _crawler.execute(aDate);
+                _lastResult = true; 
+                _lastException = null; 
              } catch (Exception e) {
-                _lastResult = e; 
+                _lastResult = false; 
+                _lastException = e; 
              } finally { 
                  _lastExecuted = aDate;
              }
-        }
+        } 
      }
      
      /**
@@ -75,13 +92,25 @@ public class CrawlerSchedule implements Serializable {
          return _lastExecuted; 
      }
      
+    public void setLastExecuted(Date aDate) { 
+        _lastExecuted = aDate;
+    }
+    
      /**
       * Gets the result of the last execution. 
+     * @return True iff last execution was a success. 
+     */
+    public boolean getLastResult() { 
+        return _lastResult; 
+    }
+    
+    /**
+     * Gets the exception thrown by the last execution. 
       * @return null if the last execution was successful or an exception 
       *   otherwise.
       */
-    public Exception getLastResult() { 
-        return _lastResult; 
+    public Exception getLastException() { 
+        return _lastException; 
      }
      
      /**
@@ -90,22 +119,51 @@ public class CrawlerSchedule implements Serializable {
       * @return True iff the crawler must be run. 
       */
      private boolean mustExecute(Date aDate) {
+        if ( _lastExecuted == null ) {
+            return false; // crawler must be started manually at least once after deployment.
+        }
          Calendar calendar = Calendar.getInstance();
          calendar.setTime(aDate);
          int hour = calendar.get(Calendar.HOUR_OF_DAY);
          if ( hour < _hourMin ) {
-            return false;
+            return false; 
          }
          if (hour > _hourMax ) { 
              return false; 
          }
-        if ( hour == _hourMin ) { 
+     
+        if ( !lastExecutionWasOnSameDay(aDate)) { 
              return true; // First execution of today. 
-        }
-        if ( _lastResult != null ) { 
+        } 
+        // last execution was on the same day.
+        if ( !_lastResult ) { 
              return true; // last execution of today was unsuccessful, retry. 
          }
-        
          return false; // already run successfully today.
      }
+    
+    /**
+     * Determines if the last execution was on the same day. 
+     * @param aDate Current time. 
+     * @return True iff last execution was on the same day. 
+     */
+    private boolean lastExecutionWasOnSameDay(Date aDate) {
+        if ( _lastExecuted == null ) { 
+            return false;
+        }
+        int curDay = getDayOfYear(aDate);
+        int lastDay = getDayOfYear(_lastExecuted);
+        return curDay == lastDay; // The year is not compared, so this is wrong only when the
+                                  // previous execution was roughly a year or more ago.
+    }
+
+    /**
+     * Gets the day of the year for the given date, using the default calendar.
+     * @param aDate Date to compute day for. 
+     */
+    private int getDayOfYear(Date aDate) {
+        Calendar calendar = Calendar.getInstance();
+        calendar.setTime(aDate);
+        return calendar.get(Calendar.DAY_OF_YEAR);
+    }
  }