/*
 * Copyright 2006 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.wamblee.crawler.kiss.scheduling;

import java.io.Serializable;
import java.util.Calendar;
import java.util.Date;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.wamblee.crawler.kiss.main.Report;

/**
 * This class encapsulates the logic for deciding whether to
 * run the crawler. This provides the mechanism to keep the
 * scheduler simple (e.g. scheduling every hour) while the more
 * complex rules for determining whether to actually run the
 * crawler live here.
 *
 * NOTE(review): no synchronization is done in this class. If
 * {@link #setMustExecute(boolean)} and {@link #execute(Date)} are called
 * from different threads, confirm that the callers provide it.
 */
public class CrawlerStatus implements Serializable {

    private static final Log LOG = LogFactory.getLog(CrawlerStatus.class);

    private final CrawlerExecutor _crawler;
    private Date _lastExecuted;
    private boolean _lastResult;
    private Exception _lastException;
    private Report _lastReport;
    private final int _hourMin;
    private final int _hourMax;
    private boolean _mustExecute;

    /**
     * Constructs the status object.
     * When triggered, the crawler runs if the current hour of the day lies
     * between the minimum (included) and maximum (included) hour and either
     * <ul>
     * <li>it has not yet been executed on the current day, or</li>
     * <li>its last execution on the current day failed.</li>
     * </ul>
     * A run requested through {@link #setMustExecute(boolean)} bypasses
     * these rules, including the hour range.
     *
     * @param aCrawler The interface through which the crawler is executed.
     * @param aHourMin The crawler may only run if hour &gt;= aHourMin
     * @param aHourMax The crawler may only run if hour &lt;= aHourMax
     */
    public CrawlerStatus(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
        _crawler = aCrawler;
        // Starting out as "executed successfully just now" means no run is
        // triggered on the day of construction; the crawler will
        // automatically run the next day.
        _lastExecuted = new Date();
        _lastResult = true;
        _lastException = null;
        _lastReport = null;
        _hourMin = aHourMin;
        _hourMax = aHourMax;
        _mustExecute = false;
    }

    /**
     * Determines whether or not the crawler must be run the next time it is triggered.
     * @param aMustExecute If true then the crawler will run the next time it is triggered
     *   by the scheduler.
     */
    public void setMustExecute(boolean aMustExecute) {
        _mustExecute = aMustExecute;
    }

    /**
     * Called by a scheduled job. This determines whether the crawler must be run or
     * not and, if so, runs it. This encapsulates the rules for retrying and
     * scheduling the crawler. An exception thrown by the crawler is not
     * propagated; it is logged and made available through
     * {@link #getLastException()}.
     *
     * @param aDate Time at which we are executing now.
     */
    public void execute(Date aDate) {
        if (!mustExecute(aDate)) {
            return;
        }

        LOG.info("Executing crawler at " + aDate);
        Report report = new Report();
        try {
            _crawler.execute(aDate, report);
            _lastResult = true;
            _lastException = null;
        } catch (Exception e) {
            _lastResult = false;
            _lastException = e;
            // Logged after the state update so that the recorded result is
            // correct even if logging itself misbehaves.
            LOG.warn("Crawler execution at " + aDate + " failed", e);
        } finally {
            // Recorded for both outcomes: a failed run still counts as the
            // last execution, and its (possibly partial) report is kept.
            _lastExecuted = aDate;
            _lastReport = report;
        }
    }

    /**
     * Gets the time the crawler was last executed.
     * @return Time of last execution. Until the first run this is the time
     *         at which this object was constructed.
     */
    public Date getLastExecuted() {
        return _lastExecuted;
    }

    /**
     * Gets the result of the last execution.
     * @return True iff last execution was a success.
     */
    public boolean getLastResult() {
        return _lastResult;
    }

    /**
     * Gets the exception thrown by the last execution.
     * @return null if the last execution was successful or an exception
     *         otherwise.
     */
    public Exception getLastException() {
        return _lastException;
    }

    /**
     * Gets the report of the last execution.
     * @return Report, or null if the crawler has not been executed yet.
     */
    public Report getLastReport() {
        return _lastReport;
    }

    /**
     * Determines whether or not the crawler must be run. As a side effect,
     * a pending forced run (see {@link #setMustExecute(boolean)}) is consumed.
     *
     * @param aDate Current time.
     * @return True iff the crawler must be run.
     */
    private boolean mustExecute(Date aDate) {
        if (_mustExecute) {
            // A forced run is honoured once, regardless of the hour range.
            _mustExecute = false;
            return true;
        }
        if (_lastExecuted == null) {
            // No known last execution time: wait for a manually forced run.
            return false;
        }

        int hour = toCalendar(aDate).get(Calendar.HOUR_OF_DAY);
        if (hour < _hourMin || hour > _hourMax) {
            return false; // outside the allowed hours of the day.
        }

        if (!lastExecutionWasOnSameDay(aDate)) {
            return true; // First execution of today.
        }
        // Last execution was on the same day: retry only if it failed.
        return !_lastResult;
    }

    /**
     * Determines if the last execution was on the same calendar day
     * (same year and same day of the year, in the default time zone).
     *
     * @param aDate Current time.
     * @return True iff last execution was on the same day.
     */
    private boolean lastExecutionWasOnSameDay(Date aDate) {
        if (_lastExecuted == null) {
            return false;
        }
        Calendar current = toCalendar(aDate);
        Calendar last = toCalendar(_lastExecuted);
        // The year is compared as well: the day of the year alone would
        // treat the same date in a different year as "today".
        return current.get(Calendar.YEAR) == last.get(Calendar.YEAR)
                && current.get(Calendar.DAY_OF_YEAR) == last.get(Calendar.DAY_OF_YEAR);
    }

    /**
     * Converts a date to a calendar in the default time zone and locale.
     *
     * @param aDate Date to convert.
     * @return Calendar set to the given date.
     */
    private static Calendar toCalendar(Date aDate) {
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(aDate);
        return calendar;
    }
}