import java.util.Calendar;
import java.util.Date;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
/**
* This class encapsulates the logic for deciding whether to
* run the crawler. This provides the mechanism to keep the
*/
public class CrawlerSchedule implements Serializable {
+ private static final Log LOG = LogFactory.getLog(CrawlerSchedule.class);
+
private CrawlerExecutor _crawler;
private Date _lastExecuted;
- private Exception _lastResult;
+ private boolean _lastResult; // true iff the most recent execution did not throw; starts out true.
+ private Exception _lastException; // exception from the most recent failed execution, null otherwise.
private int _hourMin;
private int _hourMax;
/**
* Constructs the scheduler.
- * @param aCrawler The interface by which the crawler is executed.
+ * The crawler will run if it is triggered in the range between the minimum (included)
+ * and maximum (included) hour of the day if either
+ * <ul>
+ * <li>it is triggered for the first time on the current day.</li>
+ * <li>an earlier crawling attempt on the same day failed. </li>
+ * </ul>
+ * A new schedule records the construction time as a successful last execution, so,
+ * as long as that state is unchanged, triggers on the day of construction do not
+ * run the crawler.
+ * @param aCrawler The interface through which the crawler is executed.
* @param aHourMin The crawler may only run if hour >= <code>aHourMin</code>
* @param aHourMax The crawler may only run if hour <= <code>aHourMax</code>
*/
public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
_crawler = aCrawler;
- _lastExecuted = null;
- _lastResult = null;
+ _lastExecuted = new Date(); // construction time stands in for a previous run.
+ _lastResult = true; // the crawler will automatically run the next day.
+ _lastException = null;
_hourMin = aHourMin;
_hourMax = aHourMax;
}
* @param aDate Time at which we are executing now.
*/
public void execute(Date aDate) {
+
if (mustExecute(aDate)) {
- try {
- _lastResult = null;
+ LOG.info("Executing crawler at " + aDate);
+ try {
_crawler.execute(aDate);
+ _lastResult = true; // reached only when the crawler returned without throwing.
+ _lastException = null; // drop the failure recorded by an earlier attempt, if any.
} catch (Exception e) {
- _lastResult = e;
+ _lastResult = false; // lets mustExecute() retry later on the same day.
+ _lastException = e; // kept for getLastException(); the exception is not rethrown.
} finally {
_lastExecuted = aDate;
}
- }
+ }
}
/**
return _lastExecuted;
}
+ /**
+ * Overrides the recorded time of the last execution.
+ * @param aDate Time to record as the last execution; the reference is stored
+ * as-is, without copying.
+ */
+ public void setLastExecuted(Date aDate) {
+ this._lastExecuted = aDate;
+ }
+
/**
* Gets the result of the last execution.
+ * @return True iff last execution was a success. Also true for a new schedule
+ * on which no execution has happened yet.
+ */
+ public boolean getLastResult() {
+ return _lastResult;
+ }
+
+ /**
+ * Gets the exception thrown by the last execution.
+ * A later successful execution resets this to null.
* @return null if the last execution was successful or an exception
* otherwise.
*/
- public Exception getLastResult() {
- return _lastResult;
+ public Exception getLastException() {
+ return _lastException;
}
/**
* @return True iff the crawler must be run.
*/
private boolean mustExecute(Date aDate) {
+ if ( _lastExecuted == null ) {
+ return false; // no last execution recorded (in the code shown here only setLastExecuted(null) leads to this): crawler must be started manually at least once after deployment.
+ }
Calendar calendar = Calendar.getInstance();
calendar.setTime(aDate);
int hour = calendar.get(Calendar.HOUR_OF_DAY);
if ( hour < _hourMin ) {
- return false;
+ return false; // earlier than the first allowed hour.
}
if (hour > _hourMax ) {
return false;
}
- if ( hour == _hourMin ) {
+
+ if ( !lastExecutionWasOnSameDay(aDate)) {
return true; // First execution of today.
- }
- if ( _lastResult != null ) {
+ }
+ // last execution was on the same day.
+ if ( !_lastResult ) {
return true; // last execution of today was unsuccessful, retry.
}
-
return false; // already run successfully today.
}
+
+ /**
+ * Determines if the last execution was on the same calendar day as the given time.
+ * Both the year and the day of the year must match, so an execution recorded on the
+ * same day number of an earlier year does not count as having run today.
+ * @param aDate Current time.
+ * @return True iff a last execution is recorded and it was on the same day.
+ */
+ private boolean lastExecutionWasOnSameDay(Date aDate) {
+ if ( _lastExecuted == null ) {
+ return false;
+ }
+ return getYear(aDate) == getYear(_lastExecuted)
+ && getDayOfYear(aDate) == getDayOfYear(_lastExecuted);
+ }
+
+ /**
+ * Gets the calendar year of a date, using the default calendar.
+ * @param aDate Date to compute the year for.
+ * @return Value of the Calendar.YEAR field for the date.
+ */
+ private int getYear(Date aDate) {
+ Calendar calendar = Calendar.getInstance();
+ calendar.setTime(aDate);
+ return calendar.get(Calendar.YEAR);
+ }
+
+ /**
+ * Computes the day-of-year number of a date.
+ * @param aDate Date to compute day for.
+ * @return Value of the Calendar.DAY_OF_YEAR field for the date, using the default calendar.
+ */
+ private int getDayOfYear(Date aDate) {
+ Calendar cal = Calendar.getInstance();
+ cal.setTime(aDate);
+ int dayOfYear = cal.get(Calendar.DAY_OF_YEAR);
+ return dayOfYear;
+ }
}