<classpathentry kind="lib" path="support/lib/external/spring-1.2.5.jar"/>
<classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/spring-1.2.5.jar"/>
<classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/quartz-1.5.1.jar"/>
+ <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/jstl-1.1.2.jar"/>
+ <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/standard-1.1.2.jar"/>
<classpathentry kind="output" path="crawler/kissweb/WebRoot/WEB-INF/classes"/>
</classpath>
</antcall>
</target>
+<target name="jstl.d"> <!-- Fetches the two JSTL 1.1.2 dependencies (jstl and taglibs "standard") through download.dep -->
+ <antcall target="download.dep"> <!-- no "artifact" param is passed here; presumably download.dep defaults it to the group name (confirm) -->
+ <param name="group" value="jstl"/>
+ <param name="version" value="1.1.2"/>
+ </antcall>
+ <antcall target="download.dep">
+ <param name="group" value="taglibs"/>
+ <param name="artifact" value="standard"/>
+ <param name="version" value="1.1.2"/>
+ </antcall>
+</target>
+
+
+
<target name="quartz.d">
<antcall target="download.dep">
<param name="group" value="quartz"/>
log4j.rootLogger=ERROR, console
# Log level for wamblee.org
-log4j.logger.org.wamblee=INFO
+log4j.logger.org.wamblee=DEBUG
log4j.logger.org.wamblee.usermgt.UserAdministrationImplTest=INFO
log4j.logger.org.wamblee.security.authorization=ERROR
log4j.logger.org.wamblee.cache=INFO
/**
* Default socket timeout to use.
*/
- private static final int SOCKET_TIMEOUT = 20000;
+ private static final int SOCKET_TIMEOUT = 10000;
/**
* Regular expression for matching time interval strings in the retrieved
* In case of problems sending a mail notification.
*/
public KissCrawler(String aCrawlerConfig,
- String aProgramConfig) throws IOException, NotificationException {
+ String aProgramConfig) throws IOException, NotificationException, PageException {
this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig); // delegate to the full constructor, filling in the default start URL and socket timeout
}
* In case of problems sending a mail notification.
*/
public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig,
- String aProgramConfig) throws IOException, NotificationException {
+ String aProgramConfig) throws IOException, NotificationException, PageException {
_pattern = Pattern.compile(TIME_REGEX);
} catch (PageException e) {
report.addMessage("Problem getting TV guide", e);
LOG.info("Problem getting TV guide", e);
+ throw e;
}
parser.getNotifier().send(report.asXml());
} finally {
<listener>
<listener-class>org.wamblee.crawler.kiss.servlet.Application</listener-class>
</listener>
+
+ <servlet> <!-- declares CrawlerServlet under the logical name referenced by the servlet-mapping -->
+ <servlet-name>CrawlerServlet</servlet-name>
+ <servlet-class>org.wamblee.crawler.kiss.servlet.CrawlerServlet</servlet-class>
+ </servlet>
+
+ <servlet-mapping>
+ <servlet-name>CrawlerServlet</servlet-name>
+ <url-pattern>/</url-pattern> <!-- the pattern "/" designates the web application's default servlet -->
+ </servlet-mapping>
</web-app>
&kisscrawlerdeps;
<target name="module.build.deps"
- depends="kisscrawler.src.d,servletapi.d,wamblee.kisscrawler.d,quartz.d,spring.d">
+ depends="kisscrawler.src.d,servletapi.d,wamblee.kisscrawler.d,quartz.d,spring.d,jstl.d">
</target> <!-- no body: this target only pulls in the dependency targets listed in "depends" -->
<!-- Set libraries to use in addition for test, a library which
##############################################################################
-# Class name of the beanfactory used by the photos application
+# Class name of the beanfactory used by the crawler application
##############################################################################
org.wamblee.beanfactory.class=org.wamblee.crawler.kiss.spring.CrawlerBeanFactory
<beans>
<!-- The object that tells quartz how to schedule the crawler -->
- <bean id="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler"
+ <bean id="org.wamblee.crawler.kiss.scheduling.CrawlerScheduler"
class="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler">
<constructor-arg><value type="int">3600</value></constructor-arg> <!-- presumably the trigger interval in seconds (3600 = one hour); confirm against the QuartzCrawlerScheduler constructor -->
</bean>
<bean id="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule"
class="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule">
<constructor-arg><ref local="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"/></constructor-arg>
- <constructor-arg><value type="int">5</value></constructor-arg> <!-- from 5 AM -->
- <constructor-arg><value type="int">16</value></constructor-arg> <!-- to 4 PM -->
+ <!-- The interval of the day in hours [hourmin, hourmax] over which crawling will be done and
+ retried if necessary -->
+ <constructor-arg><value type="int">5</value></constructor-arg> <!-- hourmin: crawling is allowed from 5 AM -->
+ <constructor-arg><value type="int">24</value></constructor-arg> <!-- hourmax: 24 is above the largest Calendar.HOUR_OF_DAY value (23), so no upper cut-off applies -->
</bean>
</beans>
\ No newline at end of file
*
*/
public interface CrawlerExecutor {
+
+ /**
+ * Executes the crawler.
+ * @param aDate Date at which the crawler is being triggered.
+ * @throws Exception If executing the crawler fails.
+ */
void execute(Date aDate) throws Exception;
}
/**
* Constructs the scheduler.
- * @param aCrawler The interface by which the crawler is executed.
+ * The crawler will run if it is triggered in the range between the minimum (included)
+ * and maximum (included) hour of the day if either
+ * <ul>
+ * <li>it is triggered for the first time on the current day.</li>
+ * <li>an earlier crawling attempt on the same day failed. </li>
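+ * </ul>
+ * A newly constructed schedule starts out as if the crawler had just run successfully,
+ * so the first automatic run takes place on the following day at the earliest.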
+ * @param aCrawler The interface through which the crawler is executed.
* @param aHourMin The crawler may only run if hour &gt;= <code>aHourMin</code>
* @param aHourMax The crawler may only run if hour &lt;= <code>aHourMax</code>
*/
public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
_crawler = aCrawler;
- _lastExecuted = null;
- _lastResult = false;
+ _lastExecuted = new Date();
+ _lastResult = true; // the crawler will automatically run the next day.
_lastException = null;
_hourMin = aHourMin;
_hourMax = aHourMax;
* @param aDate Time at which we are executing now.
*/
public void execute(Date aDate) {
+
if (mustExecute(aDate)) {
LOG.info("Executing crawler at " + aDate);
try {
return _lastExecuted;
}
+ public void setLastExecuted(Date aDate) { // overrides the recorded time of the last crawler run
+ _lastExecuted = aDate; // no validation: a null value is stored as-is
+ }
+
/**
* Gets the result of the last execution.
* @return True iff last execution was a success.
* @return True iff the crawler must be run.
*/
private boolean mustExecute(Date aDate) {
+ if ( _lastExecuted == null ) {
+ return false; // crawler must be started manually at least once after deployment.
+ }
Calendar calendar = Calendar.getInstance();
calendar.setTime(aDate);
int hour = calendar.get(Calendar.HOUR_OF_DAY);
if ( hour < _hourMin ) {
- return false;
+ return false;
}
if (hour > _hourMax ) {
return false;
return false; // already run successfully today.
}
+ /**
+ * Determines if the last execution was on the same day.
+ * @param aDate Current time.
+ * @return True iff last execution was on the same day; false if there is no last execution.
+ */
private boolean lastExecutionWasOnSameDay(Date aDate) {
if ( _lastExecuted == null ) {
return false;
}
int curDay = getDayOfYear(aDate);
int lastDay = getDayOfYear(_lastExecuted);
- return curDay == lastDay;
+ return curDay == lastDay; // only the values from getDayOfYear are compared; assuming that is the plain day of the year,
+ // a false match needs the two dates to lie in different years, which is not expected in practice.
}
/**
- * @param aDate
+ * Gets the day of the year
+ * @param aDate Date to compute day for.
*/
private int getDayOfYear(Date aDate) {
Calendar calendar = Calendar.getInstance();
package org.wamblee.crawler.kiss.scheduling.quartz;
import java.util.Date;
+import java.util.List;
import org.quartz.JobDetail;
import org.quartz.Scheduler;
import org.quartz.SchedulerException;
import org.quartz.SchedulerFactory;
+import org.quartz.SimpleTrigger;
import org.quartz.Trigger;
import org.quartz.TriggerUtils;
import org.quartz.impl.StdSchedulerFactory;
+import org.wamblee.crawler.kiss.scheduling.CrawlerScheduler;
/**
* Interface to the Quartz scheduler.
*/
-public class QuartzCrawlerScheduler {
+public class QuartzCrawlerScheduler implements CrawlerScheduler {
+ /**
+ * Name given to the periodic trigger that fires the crawler job.
+ */
+ private static final String TRIGGER_NAME = "interval";
+
+ /**
+ * Name under which the crawler job is registered with the scheduler.
+ */
+ private static final String JOB_NAME = "kisscrawler";
+
private Scheduler _scheduler;
private int _intervalInSeconds;
public void initialize() throws SchedulerException {
_scheduler.start();
- JobDetail jobDetail = new JobDetail("kisscrawler", null, CrawlerJob.class);
+ JobDetail jobDetail = new JobDetail(JOB_NAME, null, CrawlerJob.class);
Trigger trigger = TriggerUtils.makeSecondlyTrigger(_intervalInSeconds); // repeats every _intervalInSeconds seconds
//trigger.setStartTime(TriggerUtils.getEvenHourDate(new Date()));
trigger.setStartTime(new Date()); // start from the current time rather than from the next even hour
- trigger.setName("hourly");
+ trigger.setName(TRIGGER_NAME);
_scheduler.scheduleJob(jobDetail, trigger); // registers the crawler job together with its periodic trigger
}
+ /**
+ * Reports whether the scheduler has a job in progress right now.
+ * @see org.wamblee.crawler.kiss.scheduling.CrawlerScheduler#isCrawlerRunning()
+ * @return True iff the scheduler lists at least one currently executing job.
+ */
+ public boolean isCrawlerRunning() throws Exception {
+ return !_scheduler.getCurrentlyExecutingJobs().isEmpty();
+ }
+
+ /**
+ * Requests an extra crawler run by attaching a one-off trigger to the crawler job.
+ * @see org.wamblee.crawler.kiss.scheduling.CrawlerScheduler#scheduleNow()
+ */
+ public void scheduleNow() throws Exception {
+ // A SimpleTrigger built from only a name and group fires once, immediately.
+ Trigger oneShot = new SimpleTrigger("immediate", null);
+ oneShot.setJobName(JOB_NAME);
+ _scheduler.scheduleJob(oneShot);
+ }
+
/**
* Shuts down the scheduler.
* @throws SchedulerException
package org.wamblee.crawler.kiss.servlet;
-import java.util.Date;
-
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;
-import org.quartz.JobDetail;
-import org.quartz.Scheduler;
import org.quartz.SchedulerException;
-import org.quartz.SchedulerFactory;
-import org.quartz.Trigger;
-import org.quartz.TriggerUtils;
-import org.quartz.core.QuartzScheduler;
-import org.quartz.impl.StdSchedulerFactory;
-import org.wamblee.crawler.kiss.scheduling.quartz.CrawlerJob;
-import org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler;
+import org.wamblee.crawler.kiss.scheduling.CrawlerScheduler;
import org.wamblee.general.BeanKernel;
/**
aEvent.getServletContext().log("KiSS Crawler initializing");
try {
getScheduler().initialize();
- } catch (SchedulerException e) {
+ } catch (Exception e) {
aEvent.getServletContext().log("Error scheduling job", e);
+ return;
}
aEvent.getServletContext().log("KiSS Crawler initialized");
}
aEvent.getServletContext().log("KiSS Crawler shutting down");
try {
getScheduler().shutdown();
- } catch (SchedulerException e) {
+ } catch (Exception e) {
aEvent.getServletContext().log("Error scheduling job", e);
+ return;
}
aEvent.getServletContext().log("KiSS Crawler shut down complete");
}
* Gets the scheduler from Spring.
* @return Scheduler.
*/
- private QuartzCrawlerScheduler getScheduler() {
- return BeanKernel.getBeanFactory().find(QuartzCrawlerScheduler.class);
+ private CrawlerScheduler getScheduler() {
+ return BeanKernel.getBeanFactory().find(CrawlerScheduler.class); // the bean is resolved by the CrawlerScheduler interface type
}
- public static void main(String[] aArgs) throws SchedulerException {
+ public static void main(String[] aArgs) throws Exception {
Application application = new Application();
application.getScheduler().initialize(); // starts scheduling directly; no ServletContextEvent is involved on this path
}