wamblee.org Git - utils/blob - trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java

   1 /*
   2  * Copyright 2006 the original author or authors.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 package org.wamblee.crawler.kiss.scheduling;
  18
  19 import java.io.Serializable;
  20 import java.util.Calendar;
  21 import java.util.Date;
  22
  23 import org.apache.commons.logging.Log;
  24 import org.apache.commons.logging.LogFactory;
  25
  26 /**
  27  * This class encapsulates the logic for deciding whether to
  28  * run the crawler. This provides the mechanism to keep the
  29  * scheduler simple (e.g. scheduling every hour) and providing
  30  * more complex logic for determining whether to run the
  31  * crawler.
  32  */
  33 public class CrawlerSchedule implements Serializable {
  34
  35     private static final Log LOG = LogFactory.getLog(CrawlerSchedule.class);
  36
  37     private CrawlerExecutor _crawler;
  38     private Date _lastExecuted;
  39     private boolean _lastResult;
  40     private Exception _lastException;
  41     private int _hourMin;
  42     private int _hourMax;
  43
  44     /**
  45      * Constructs the scheduler.
  46      * @param aCrawler The interface by which the crawler is executed.
  47      * @param aHourMin The crawler may only run if hour &gt;= <code>aHourMin</code>
  48      * @param aHourMax The crawler may only run if hour &lt;= <code>aHourMax</code>
  49      */
  50     public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
  51         _crawler = aCrawler;
  52         _lastExecuted = null;
  53         _lastResult = false;
  54         _lastException = null;
  55         _hourMin = aHourMin;
  56         _hourMax = aHourMax;
  57     }
  58
  59     /**
  60      * Called by a scheduled job. This determines whether the crawler must be run or
  61      * not. This encapsulates the rukes for retrying and scheduling the crawler.
  62      * @param aDate Time at which we are executing now.
  63      */
  64     public void execute(Date aDate) {
  65         if (mustExecute(aDate)) {
  66             LOG.info("Executing crawler at " + aDate);
  67             try {
  68                 _crawler.execute(aDate);
  69                 _lastResult = true;
  70                 _lastException = null;
  71             } catch (Exception e) {
  72                 _lastResult = false;
  73                 _lastException = e;
  74             } finally {
  75                 _lastExecuted = aDate;
  76             }
  77         }
  78     }
  79
  80     /**
  81      * Gets the time the crawler was last executed.
  82      * @return Time of last execution.
  83      */
  84     public Date getLastExecuted() {
  85         return _lastExecuted;
  86     }
  87
  88     /**
  89      * Gets the result of the last execution.
  90      * @return True iff last execution was a success.
  91      */
  92     public boolean getLastResult() {
  93         return _lastResult;
  94     }
  95
  96     /**
  97      * Gets the exception thrown by the last execution.
  98      * @return null if the last execution was successful or an exception
  99      *   otherwise.
 100      */
 101     public Exception getLastException() {
 102         return _lastException;
 103     }
 104
 105     /**
 106      * Determines whether or not the crawler must be run.
 107      * @param aDate Current time.
 108      * @return True iff the crawler must be run.
 109      */
 110     private boolean mustExecute(Date aDate) {
 111         Calendar calendar = Calendar.getInstance();
 112         calendar.setTime(aDate);
 113         int hour = calendar.get(Calendar.HOUR_OF_DAY);
 114         if ( hour < _hourMin ) {
 115             return false;
 116         }
 117         if (hour > _hourMax ) {
 118             return false;
 119         }
 120
 121         if ( !lastExecutionWasOnSameDay(aDate)) {
 122             return true; // First execution of today.
 123         }
 124         // last execution was on the same day.
 125         if ( !_lastResult ) {
 126             return true; // last execution of today was unsuccessful, retry.
 127         }
 128         return false; // already run successfully today.
 129     }
 130
 131     private boolean lastExecutionWasOnSameDay(Date aDate) {
 132         if ( _lastExecuted == null ) {
 133             return false;
 134         }
 135         int curDay = getDayOfYear(aDate);
 136         int lastDay = getDayOfYear(_lastExecuted);
 137         return curDay == lastDay;
 138     }
 139
 140     /**
 141      * @param aDate
 142      */
 143     private int getDayOfYear(Date aDate) {
 144         Calendar calendar = Calendar.getInstance();
 145         calendar.setTime(aDate);
 146         return calendar.get(Calendar.DAY_OF_YEAR);
 147     }
 148 }