Moved crawler to

[utils] / crawler / kissweb / src / main / java / org / wamblee / crawler / kiss / scheduling / CrawlerStatus.java
diff --git a/crawler/kissweb/src/main/java/org/wamblee/crawler/kiss/scheduling/CrawlerStatus.java b/crawler/kissweb/src/main/java/org/wamblee/crawler/kiss/scheduling/CrawlerStatus.java

deleted file mode 100644 (file)

index 065a229..0000000
--- a/crawler/kissweb/src/main/java/org/wamblee/crawler/kiss/scheduling/CrawlerStatus.java
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright 2006 the original author or authors.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *      http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ 
-
-package org.wamblee.crawler.kiss.scheduling;
-
-import java.io.Serializable;
-import java.util.Calendar;
-import java.util.Date;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.wamblee.crawler.kiss.main.Report;
-
-/**
- * This class encapsulates the logic for deciding whether to
- * run the crawler. This provides the mechanism to keep the 
- * scheduler simple (e.g. scheduling every hour) while providing
- * more complex logic for determining whether to run the 
- * crawler. 
- */
-public class CrawlerStatus implements Serializable {
-    
-    private static final Log LOG = LogFactory.getLog(CrawlerStatus.class);
-    
-    private CrawlerExecutor _crawler; 
-    private Date _lastExecuted;
-    private boolean _lastResult;
-    private Exception _lastException;
-    private Report _lastReport; 
-    private int _hourMin; 
-    private int _hourMax;
-    private boolean _mustExecute; 
-    
-    /**
-     * Constructs the scheduler.
-     * The crawler will run if it is triggered in the range between the minimum (included)
-     * and maximum (included) hour of the day if either
-     * <ul>
-     *   <li>it is triggered for the first time on the current day.</li>
-     *   <li>an earlier crawling attempt on the same day failed. </li>
-     * </ul>
-     * @param aCrawler The interface through which the crawler is executed.
-     * @param aHourMin The crawler may only run if hour &gt;= <code>aHourMin</code>
-     * @param aHourMax The crawler may only run if hour &lt;= <code>aHourMax</code>
-     */
-    public CrawlerStatus(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) { 
-        _crawler = aCrawler; 
-        _lastExecuted = new Date(); 
-        _lastResult = true; // the crawler will automatically run the next day. 
-        _lastException = null;
-        _lastReport = null; 
-        _hourMin = aHourMin;
-        _hourMax = aHourMax;
-        _mustExecute = false; 
-    }
-    
-    /**
-     * Determines whether or not the crawler must be run the next time it is triggered.
-     * @param aMustExecute If true then the crawler will run the next time it is triggered
-     *   by the scheduler. 
-     */
-    public void setMustExecute(boolean aMustExecute) { 
-        _mustExecute = aMustExecute; 
-    }
-    
-    /**
-     * Called by a scheduled job. This determines whether the crawler must be run or 
-     * not. This encapsulates the rules for retrying and scheduling the crawler. 
-     * @param aDate Time at which we are executing now. 
-     */
-    public void execute(Date aDate) { 
-        
-        if (mustExecute(aDate)) { 
-            LOG.info("Executing crawler at " + aDate);
-            Report report = new Report();
-            try {
-                _crawler.execute(aDate, report);
-                _lastResult = true; 
-                _lastException = null; 
-            } catch (Exception e) {
-                _lastResult = false; 
-                _lastException = e; 
-            } finally { 
-                _lastExecuted = aDate;
-                _lastReport = report;
-            }
-        } 
-    }
-    
-    /**
-     * Gets the time the crawler was last executed.
-     * @return Time of last execution. 
-     */
-    public Date getLastExecuted() { 
-        return _lastExecuted; 
-    }
-    
-    /**
-     * Gets the result of the last execution. 
-     * @return True iff last execution was a success. 
-     */
-    public boolean getLastResult() { 
-        return _lastResult; 
-    }
-    
-    /**
-     * Gets the exception thrown by the last execution. 
-     * @return null if the last execution was successful or an exception 
-     *   otherwise.
-     */
-    public Exception getLastException() { 
-        return _lastException; 
-    }
-    
-    /**
-     * Gets the report produced by the last crawler execution. 
-     * @return Report. 
-     */
-    public Report getLastReport() { 
-        return _lastReport; 
-    }
-    
-    /**
-     * Determines whether or not the crawler must be run. 
-     * @param aDate Current time. 
-     * @return True iff the crawler must be run. 
-     */
-    private boolean mustExecute(Date aDate) {
-        if (_mustExecute) { 
-            _mustExecute = false;
-            return true; 
-        }
-        if ( _lastExecuted == null ) {
-            return false; // crawler must be started manually at least once after deployment.
-        }
-        Calendar calendar = Calendar.getInstance();
-        calendar.setTime(aDate);
-        int hour = calendar.get(Calendar.HOUR_OF_DAY);
-        if ( hour < _hourMin ) {
-            return false; 
-        }
-        if (hour > _hourMax ) { 
-            return false; 
-        }
-     
-        if ( !lastExecutionWasOnSameDay(aDate)) { 
-            return true; // First execution of today. 
-        } 
-        // last execution was on the same day.
-        if ( !_lastResult ) { 
-            return true; // last execution of today was unsuccessful, retry. 
-        }
-        return false; // already run successfully today.
-    }
-    
-    /**
-     * Determines if the last execution was on the same day. 
-     * @param aDate Current time. 
-     * @return True iff last execution was on the same day. 
-     */
-    private boolean lastExecutionWasOnSameDay(Date aDate) {
-        if ( _lastExecuted == null ) { 
-            return false;
-        }
-        int curDay = getDayOfYear(aDate);
-        int lastDay = getDayOfYear(_lastExecuted);
-        return curDay == lastDay; // check can be invalid only if scheduling interval is one year,
-                                  // which is ridiculous.
-    }
-
-    /**
-     * Gets the day of the year
-     * @param aDate Date to compute day for. 
-     */
-    private int getDayOfYear(Date aDate) {
-        Calendar calendar = Calendar.getInstance();
-        calendar.setTime(aDate);
-        return calendar.get(Calendar.DAY_OF_YEAR);
-    }
-}