--- /dev/null
+/*
+ * Copyright 2006 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.scheduling;
+
+import java.io.Serializable;
+import java.util.Calendar;
+import java.util.Date;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.wamblee.crawler.kiss.main.Report;
+
+/**
+ * Decides whether the crawler must run when the scheduler triggers it, and
+ * records the outcome of the most recent run.
+ * <p>
+ * Keeping this decision here allows the scheduler itself to stay simple (e.g.
+ * triggering every hour), while the rules for the allowed time window, the
+ * once-per-day limit and the retry after a failure live in this class.
+ */
+public class CrawlerStatus implements Serializable {
+
+    private static final Log LOG = LogFactory.getLog(CrawlerStatus.class);
+
+    /** Interface through which the crawler is actually run. */
+    private CrawlerExecutor _crawler;
+
+    /** Time passed to the most recent run (initially the construction time). */
+    private Date _lastExecuted;
+
+    /** True iff the most recent run completed without throwing an exception. */
+    private boolean _lastResult;
+
+    /** Exception thrown by the most recent run, or null if it succeeded. */
+    private Exception _lastException;
+
+    /** Report filled in by the most recent run, or null if there was no run yet. */
+    private Report _lastReport;
+
+    /** First hour of the day (inclusive) in which a triggered run is allowed. */
+    private int _hourMin;
+
+    /** Last hour of the day (inclusive) in which a triggered run is allowed. */
+    private int _hourMax;
+
+    /** One-shot flag: when set, the next trigger runs the crawler unconditionally. */
+    private boolean _mustExecute;
+
+    /**
+     * Constructs the crawler status.
+     * The crawler will run if it is triggered in the range between the minimum
+     * (included) and maximum (included) hour of the day if either
+     * <ul>
+     *   <li>it is triggered for the first time on the current day, or</li>
+     *   <li>an earlier crawling attempt on the same day failed.</li>
+     * </ul>
+     * The status starts out as "executed successfully just now", so no
+     * automatic run takes place on the day of construction unless
+     * {@link #setMustExecute(boolean)} is used to force one.
+     *
+     * @param aCrawler The interface through which the crawler is executed.
+     * @param aHourMin The crawler may only run if hour &gt;= <code>aHourMin</code>
+     * @param aHourMax The crawler may only run if hour &lt;= <code>aHourMax</code>
+     */
+    public CrawlerStatus(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
+        _crawler = aCrawler;
+        _lastExecuted = new Date();
+        _lastResult = true; // the crawler will automatically run the next day.
+        _lastException = null;
+        _lastReport = null;
+        _hourMin = aHourMin;
+        _hourMax = aHourMax;
+        _mustExecute = false;
+    }
+
+    /**
+     * Determines whether or not the crawler must be run the next time it is
+     * triggered. The flag is consumed by the first trigger that sees it.
+     *
+     * @param aMustExecute If true then the crawler will run the next time it is
+     *            triggered by the scheduler, regardless of the time window and
+     *            of earlier runs.
+     */
+    public void setMustExecute(boolean aMustExecute) {
+        _mustExecute = aMustExecute;
+    }
+
+    /**
+     * Called by a scheduled job. This determines whether the crawler must be
+     * run or not and, if so, runs it. This encapsulates the rules for retrying
+     * and scheduling the crawler.
+     * <p>
+     * An exception thrown by the crawler is not propagated: it is logged,
+     * remembered for {@link #getLastException()} and marks the run as failed,
+     * so that a later trigger on the same day retries.
+     *
+     * @param aDate Time at which we are executing now.
+     */
+    public void execute(Date aDate) {
+        if (!mustExecute(aDate)) {
+            return;
+        }
+        LOG.info("Executing crawler at " + aDate);
+        Report report = new Report();
+        try {
+            _crawler.execute(aDate, report);
+            _lastResult = true;
+            _lastException = null;
+        } catch (Exception e) {
+            // Make the failure visible in the log; previously it was only
+            // stored and therefore easy to miss.
+            LOG.error("Crawler execution failed at " + aDate, e);
+            _lastResult = false;
+            _lastException = e;
+        } finally {
+            // Record the attempt whether it succeeded or not; the report may
+            // be only partially filled in after a failure.
+            _lastExecuted = aDate;
+            _lastReport = report;
+        }
+    }
+
+    /**
+     * Gets the time the crawler was last executed.
+     *
+     * @return Time of last execution; before the first run this is the time
+     *         at which this object was constructed.
+     */
+    public Date getLastExecuted() {
+        return _lastExecuted;
+    }
+
+    /**
+     * Gets the result of the last execution.
+     *
+     * @return True iff last execution was a success.
+     */
+    public boolean getLastResult() {
+        return _lastResult;
+    }
+
+    /**
+     * Gets the exception thrown by the last execution.
+     *
+     * @return null if the last execution was successful or an exception
+     *         otherwise.
+     */
+    public Exception getLastException() {
+        return _lastException;
+    }
+
+    /**
+     * Gets the report of the last execution.
+     *
+     * @return Report, or null if the crawler has not been executed yet.
+     */
+    public Report getLastReport() {
+        return _lastReport;
+    }
+
+    /**
+     * Determines whether or not the crawler must be run. A pending forced run
+     * takes precedence over all other rules and is consumed by this call.
+     *
+     * @param aDate Current time.
+     * @return True iff the crawler must be run.
+     */
+    private boolean mustExecute(Date aDate) {
+        if (_mustExecute) {
+            _mustExecute = false; // one-shot: only the next trigger is forced.
+            return true;
+        }
+        if (_lastExecuted == null) {
+            return false; // crawler must be started manually at least once after deployment.
+        }
+        int hour = toCalendar(aDate).get(Calendar.HOUR_OF_DAY);
+        if (hour < _hourMin || hour > _hourMax) {
+            return false; // outside the allowed time window.
+        }
+        if (!lastExecutionWasOnSameDay(aDate)) {
+            return true; // first execution of today.
+        }
+        // Last execution was on the same day: retry only if it failed.
+        return !_lastResult;
+    }
+
+    /**
+     * Determines if the last execution was on the same calendar day as the
+     * given time. Both the year and the day of the year are compared, so a
+     * last execution on the same day of an earlier year does not count.
+     *
+     * @param aDate Current time.
+     * @return True iff last execution was on the same day.
+     */
+    private boolean lastExecutionWasOnSameDay(Date aDate) {
+        if (_lastExecuted == null) {
+            return false;
+        }
+        Calendar current = toCalendar(aDate);
+        Calendar last = toCalendar(_lastExecuted);
+        return current.get(Calendar.YEAR) == last.get(Calendar.YEAR)
+            && current.get(Calendar.DAY_OF_YEAR) == last.get(Calendar.DAY_OF_YEAR);
+    }
+
+    /**
+     * Converts a date to a calendar in the default time zone and locale.
+     *
+     * @param aDate Date to convert.
+     * @return Calendar set to the given date.
+     */
+    private Calendar toCalendar(Date aDate) {
+        Calendar calendar = Calendar.getInstance();
+        calendar.setTime(aDate);
+        return calendar;
+    }
+}