X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fkissweb%2Fsrc%2Fmain%2Fjava%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2Fscheduling%2FCrawlerStatus.java;fp=crawler%2Fkissweb%2Fsrc%2Fmain%2Fjava%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2Fscheduling%2FCrawlerStatus.java;h=0000000000000000000000000000000000000000;hb=2ef561a4fbf29b65335f1558bfebd74733a6ddd9;hp=065a229487b6d87fded759f7ba5e03fb2da6fba8;hpb=54903ea538a09fdb1e2ee6dc37e89bb85aebfec4;p=utils diff --git a/crawler/kissweb/src/main/java/org/wamblee/crawler/kiss/scheduling/CrawlerStatus.java b/crawler/kissweb/src/main/java/org/wamblee/crawler/kiss/scheduling/CrawlerStatus.java deleted file mode 100644 index 065a2294..00000000 --- a/crawler/kissweb/src/main/java/org/wamblee/crawler/kiss/scheduling/CrawlerStatus.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright 2006 the original author or authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.wamblee.crawler.kiss.scheduling; - -import java.io.Serializable; -import java.util.Calendar; -import java.util.Date; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.wamblee.crawler.kiss.main.Report; - -/** - * This class encapsulates the logic for deciding whether to - * run the crawler. This provides the mechanism to keep the - * scheduler simple (e.g. scheduling every hour) and providing - * more complex logic for determining whether to run the - * crawler. 
- */ -public class CrawlerStatus implements Serializable { - - private static final Log LOG = LogFactory.getLog(CrawlerStatus.class); - - private CrawlerExecutor _crawler; - private Date _lastExecuted; - private boolean _lastResult; - private Exception _lastException; - private Report _lastReport; - private int _hourMin; - private int _hourMax; - private boolean _mustExecute; - - /** - * Constructs the scheduler. - * The crawler will run if it is triggered in the range between the minimum (included) - * and maximum (included) hour of the day if either - *
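- * <ul>
- * <li>it has not yet been executed on the current day, or</li>
- * <li>the last execution on the current day was unsuccessful.</li>
- * </ul>
- * @param aCrawler The executor through which the crawler is run.
- * @param aHourMin The crawler may only run if hour >= aHourMin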
- * @param aHourMax The crawler may only run if hour <= aHourMax
- */
- public CrawlerStatus(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
-     // Collaborator and the hour-of-day window in which automatic runs are allowed.
-     this._crawler = aCrawler;
-     this._hourMin = aHourMin;
-     this._hourMax = aHourMax;
-
-     // Start out as if a successful run had just completed: the first automatic run
-     // then happens on a later day, unless a run is forced through setMustExecute.
-     this._lastExecuted = new Date();
-     this._lastResult = true;
-     this._lastException = null;
-     this._lastReport = null;
-
-     // No forced run is pending initially.
-     this._mustExecute = false;
- }
-
- /**
-  * Requests (or cancels a request for) an unconditional crawler run on the next trigger.
-  * @param aMustExecute If true, the next trigger from the scheduler runs the crawler
-  *     regardless of the hour window and of earlier results; if false, the normal
-  *     scheduling rules apply.
-  */
- public void setMustExecute(boolean aMustExecute) {
-     this._mustExecute = aMustExecute;
- }
-
- /**
-  * Called by a scheduled job. Decides whether the crawler must run at this point in
-  * time and, if so, runs it. This encapsulates the rules for retrying and scheduling
-  * the crawler.
-  * <p>
-  * When the crawler is run, the execution time, the report and the outcome are stored
-  * so they can be queried through the getters of this class. An exception thrown by the
-  * crawler is logged and stored; it is not propagated to the caller.
-  * @param aDate Time at which we are executing now.
-  */
- public void execute(Date aDate) {
-     if (!mustExecute(aDate)) {
-         return; // Not due to run at this trigger.
-     }
-
-     LOG.info("Executing crawler at " + aDate);
-     Report report = new Report();
-     try {
-         _crawler.execute(aDate, report);
-         _lastResult = true;
-         _lastException = null;
-     } catch (Exception e) {
-         // Keep the failure for later inspection, and log it so that a failed
-         // scheduled run is visible even if nobody queries the status.
-         LOG.error("Crawler execution at " + aDate + " failed", e);
-         _lastResult = false;
-         _lastException = e;
-     } finally {
-         // Recorded for failed runs too, so the retry rules see this attempt.
-         _lastExecuted = aDate;
-         _lastReport = report;
-     }
- }
-
- /**
-  * Returns the moment recorded for the most recent crawler run. Until the crawler has
-  * actually been run, this is the time at which this object was constructed.
-  * @return Time of last execution.
-  */
- public Date getLastExecuted() {
-     return this._lastExecuted;
- }
-
- /**
-  * Tells whether the most recent crawler run succeeded. The value is true before the
-  * crawler has been run for the first time.
-  * @return True iff last execution was a success.
-  */
- public boolean getLastResult() {
-     return this._lastResult;
- }
-
- /**
-  * Returns the exception that made the most recent crawler run fail.
-  * @return The exception thrown by the last execution, or null if the last execution
-  *     was successful or the crawler has not been run yet.
-  */
- public Exception getLastException() {
-     return this._lastException;
- }
-
- /**
-  * Returns the report that was filled in by the most recent crawler run.
-  * @return Report of the last execution, or null if the crawler has not been run yet.
-  */
- public Report getLastReport() {
-     return this._lastReport;
- }
-
- /**
-  * Applies the scheduling rules to decide whether the crawler has to run now.
-  * A pending forced run is consumed by this call: the flag is reset when it is honoured.
-  * @param aDate Current time.
-  * @return True iff the crawler must be run.
-  */
- private boolean mustExecute(Date aDate) {
-     // A forced run takes precedence over every other rule and is used up here.
-     if (_mustExecute) {
-         _mustExecute = false;
-         return true;
-     }
-     // Without a recorded execution the crawler must first be started manually
-     // at least once after deployment.
-     if (_lastExecuted == null) {
-         return false;
-     }
-
-     Calendar now = Calendar.getInstance();
-     now.setTime(aDate);
-     int currentHour = now.get(Calendar.HOUR_OF_DAY);
-     boolean withinWindow = currentHour >= _hourMin && currentHour <= _hourMax;
-     if (!withinWindow) {
-         return false;
-     }
-
-     // Inside the window: run on the first trigger of the day, or retry when today's
-     // last run failed. Otherwise the crawler already ran successfully today.
-     return !lastExecutionWasOnSameDay(aDate) || !_lastResult;
- }
-
- /**
-  * Determines if the last execution was on the same calendar day as the given time.
-  * Both the year and the day of the year are compared, so the same day-of-year in a
-  * different year does not count as the same day.
-  * @param aDate Current time.
-  * @return True iff last execution was on the same day; false if there is no recorded
-  *     last execution.
-  */
- private boolean lastExecutionWasOnSameDay(Date aDate) {
-     if (_lastExecuted == null) {
-         return false;
-     }
-     // Comparing the day of the year alone would treat e.g. Jan 5th of two different
-     // years as the same day and could suppress a run that is due.
-     return getYear(aDate) == getYear(_lastExecuted)
-         && getDayOfYear(aDate) == getDayOfYear(_lastExecuted);
- }
-
- /**
-  * Gets the year of a date in the default time zone and locale.
-  * @param aDate Date to compute the year for.
-  * @return Value of the calendar's YEAR field.
-  */
- private int getYear(Date aDate) {
-     Calendar calendar = Calendar.getInstance();
-     calendar.setTime(aDate);
-     return calendar.get(Calendar.YEAR);
- }
-
- /**
-  * Computes the day of the year of a date, using a calendar for the default time zone
-  * and locale.
-  * @param aDate Date to compute day for.
-  * @return Value of the calendar's DAY_OF_YEAR field for the given date.
-  */
- private int getDayOfYear(Date aDate) {
-     Calendar cal = Calendar.getInstance();
-     cal.setTime(aDate);
-     int dayOfYear = cal.get(Calendar.DAY_OF_YEAR);
-     return dayOfYear;
- }
-}