From 616d9c8927b015cd8b652460c6227b40ee1ecd2e Mon Sep 17 00:00:00 2001 From: erik Date: Tue, 25 Apr 2006 19:21:16 +0000 Subject: [PATCH] --- .classpath | 2 ++ build/header.xml | 14 ++++++++ crawler/basic/src/log4j.properties | 2 +- .../crawler/kiss/main/KissCrawler.java | 7 ++-- crawler/kissweb/WebRoot/WEB-INF/web.xml | 10 ++++++ crawler/kissweb/build.xml | 2 +- .../src/org.wamblee.beanfactory.properties | 2 +- .../kissweb/src/org.wamblee.crawler.kiss.xml | 8 +++-- .../kiss/scheduling/CrawlerExecutor.java | 6 ++++ .../kiss/scheduling/CrawlerSchedule.java | 33 +++++++++++++---- .../quartz/QuartzCrawlerScheduler.java | 36 +++++++++++++++++-- .../crawler/kiss/servlet/Application.java | 24 +++++-------- 12 files changed, 112 insertions(+), 34 deletions(-) diff --git a/.classpath b/.classpath index 67c79631..14a82156 100644 --- a/.classpath +++ b/.classpath @@ -61,5 +61,7 @@ + + diff --git a/build/header.xml b/build/header.xml index 29b6aa95..537f84f4 100644 --- a/build/header.xml +++ b/build/header.xml @@ -96,6 +96,20 @@ + + + + + + + + + + + + + + diff --git a/crawler/basic/src/log4j.properties b/crawler/basic/src/log4j.properties index 65256149..ab710b36 100644 --- a/crawler/basic/src/log4j.properties +++ b/crawler/basic/src/log4j.properties @@ -10,7 +10,7 @@ log4j.rootLogger=ERROR, console # Log level for wamblee.org -log4j.logger.org.wamblee=INFO +log4j.logger.org.wamblee=DEBUG log4j.logger.org.wamblee.usermgt.UserAdministrationImplTest=INFO log4j.logger.org.wamblee.security.authorization=ERROR log4j.logger.org.wamblee.cache=INFO diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java index a9a8097a..969c5b23 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java @@ -65,7 +65,7 @@ public class KissCrawler { /** * Default socket timeout to use. 
*/ - private static final int SOCKET_TIMEOUT = 20000; + private static final int SOCKET_TIMEOUT = 10000; /** * Regular expression for matching time interval strings in the retrieved @@ -107,7 +107,7 @@ public class KissCrawler { * In case of problems sending a mail notification. */ public KissCrawler(String aCrawlerConfig, - String aProgramConfig) throws IOException, NotificationException { + String aProgramConfig) throws IOException, NotificationException, PageException { this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig); } @@ -130,7 +130,7 @@ public class KissCrawler { * In case of problems sending a mail notification. */ public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig, - String aProgramConfig) throws IOException, NotificationException { + String aProgramConfig) throws IOException, NotificationException, PageException { _pattern = Pattern.compile(TIME_REGEX); @@ -162,6 +162,7 @@ public class KissCrawler { } catch (PageException e) { report.addMessage("Problem getting TV guide", e); LOG.info("Problem getting TV guide", e); + throw e; } parser.getNotifier().send(report.asXml()); } finally { diff --git a/crawler/kissweb/WebRoot/WEB-INF/web.xml b/crawler/kissweb/WebRoot/WEB-INF/web.xml index 3239a34a..2ff8779f 100644 --- a/crawler/kissweb/WebRoot/WEB-INF/web.xml +++ b/crawler/kissweb/WebRoot/WEB-INF/web.xml @@ -8,4 +8,14 @@ org.wamblee.crawler.kiss.servlet.Application + + + CrawlerServlet + org.wamblee.crawler.kiss.servlet.CrawlerServlet + + + + CrawlerServlet + / + diff --git a/crawler/kissweb/build.xml b/crawler/kissweb/build.xml index aa400c7b..d1987984 100644 --- a/crawler/kissweb/build.xml +++ b/crawler/kissweb/build.xml @@ -22,7 +22,7 @@ &kisscrawlerdeps; + depends="kisscrawler.src.d,servletapi.d,wamblee.kisscrawler.d,quartz.d,spring.d,jstl.d"> - 3600 @@ -21,8 +21,10 @@ - 5 - 16 + + 5 + 24 \ No newline at end of file diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java 
b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java index e07d90a6..428ba983 100644 --- a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java @@ -25,5 +25,11 @@ import java.util.Date; * */ public interface CrawlerExecutor { + + /** + * Executes the crawler. + * @param aDate Date the crawler is being triggered. + * @throws Exception + */ void execute(Date aDate) throws Exception; } diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java index e341169a..5121f920 100644 --- a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java @@ -43,14 +43,20 @@ public class CrawlerSchedule implements Serializable { /** * Constructs the scheduler. - * @param aCrawler The interface by which the crawler is executed. + * The crawler will run if it is triggered in the range between the minimum (included) + * and maximum (included) hour of the day if either + *
+ * <ul>
+ * <li> it is triggered for the first time on the current day. </li>
+ * <li> an earlier crawling attempt on the same day failed. </li>
+ * </ul>
+ * @param aCrawler The interface through which the crawler is executed. * @param aHourMin The crawler may only run if hour >= aHourMin * @param aHourMax The crawler may only run if hour <= aHourMax */ public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) { _crawler = aCrawler; - _lastExecuted = null; - _lastResult = false; + _lastExecuted = new Date(); + _lastResult = true; // the crawler will automatically run the next day. _lastException = null; _hourMin = aHourMin; _hourMax = aHourMax; @@ -62,6 +68,7 @@ public class CrawlerSchedule implements Serializable { * @param aDate Time at which we are executing now. */ public void execute(Date aDate) { + if (mustExecute(aDate)) { LOG.info("Executing crawler at " + aDate); try { @@ -85,6 +92,10 @@ public class CrawlerSchedule implements Serializable { return _lastExecuted; } + public void setLastExecuted(Date aDate) { + _lastExecuted = aDate; + } + /** * Gets the result of the last execution. * @return True iff last execution was a success. @@ -108,11 +119,14 @@ public class CrawlerSchedule implements Serializable { * @return True iff the crawler must be run. */ private boolean mustExecute(Date aDate) { + if ( _lastExecuted == null ) { + return false; // crawler must be started manually at least once after deployment. + } Calendar calendar = Calendar.getInstance(); calendar.setTime(aDate); int hour = calendar.get(Calendar.HOUR_OF_DAY); if ( hour < _hourMin ) { - return false; + return false; } if (hour > _hourMax ) { return false; @@ -128,17 +142,24 @@ public class CrawlerSchedule implements Serializable { return false; // already run successfully today. } + /** + * Determines if the last execution was on the same day. + * @param aDate Current time. + * @return True iff last execution was on the same day. 
+ */ private boolean lastExecutionWasOnSameDay(Date aDate) { if ( _lastExecuted == null ) { return false; } int curDay = getDayOfYear(aDate); int lastDay = getDayOfYear(_lastExecuted); - return curDay == lastDay; + return curDay == lastDay; // check can be invalid only if scheduling interval is one year, + // which is ridiculous. } /** - * @param aDate + * Gets the day of the year + * @param aDate Date to compute day for. */ private int getDayOfYear(Date aDate) { Calendar calendar = Calendar.getInstance(); diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java index 94201f37..5fa7cf86 100644 --- a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java @@ -17,20 +17,33 @@ package org.wamblee.crawler.kiss.scheduling.quartz; import java.util.Date; +import java.util.List; import org.quartz.JobDetail; import org.quartz.Scheduler; import org.quartz.SchedulerException; import org.quartz.SchedulerFactory; +import org.quartz.SimpleTrigger; import org.quartz.Trigger; import org.quartz.TriggerUtils; import org.quartz.impl.StdSchedulerFactory; +import org.wamblee.crawler.kiss.scheduling.CrawlerScheduler; /** * Interface to the Quartz scheduler. 
*/ -public class QuartzCrawlerScheduler { +public class QuartzCrawlerScheduler implements CrawlerScheduler { + /** + * + */ + private static final String TRIGGER_NAME = "interval"; + + /** + * + */ + private static final String JOB_NAME = "kisscrawler"; + private Scheduler _scheduler; private int _intervalInSeconds; @@ -53,15 +66,32 @@ public class QuartzCrawlerScheduler { public void initialize() throws SchedulerException { _scheduler.start(); - JobDetail jobDetail = new JobDetail("kisscrawler", null, CrawlerJob.class); + JobDetail jobDetail = new JobDetail(JOB_NAME, null, CrawlerJob.class); Trigger trigger = TriggerUtils.makeSecondlyTrigger(_intervalInSeconds); //trigger.setStartTime(TriggerUtils.getEvenHourDate(new Date())); trigger.setStartTime(new Date()); - trigger.setName("hourly"); + trigger.setName(TRIGGER_NAME); _scheduler.scheduleJob(jobDetail, trigger); } + /* (non-Javadoc) + * @see org.wamblee.crawler.kiss.scheduling.CrawlerScheduler#isCrawlerRunning() + */ + public boolean isCrawlerRunning() throws Exception { + List jobs = _scheduler.getCurrentlyExecutingJobs(); + return jobs.size() > 0; + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.kiss.scheduling.CrawlerScheduler#scheduleNow() + */ + public void scheduleNow() throws Exception { + Trigger trigger = new SimpleTrigger("immediate", null); + trigger.setJobName(JOB_NAME); + _scheduler.scheduleJob(trigger); + } + /** * Shuts down the scheduler. 
* @throws SchedulerException diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java index d8be2bf7..19d3b6fa 100644 --- a/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java @@ -16,21 +16,11 @@ package org.wamblee.crawler.kiss.servlet; -import java.util.Date; - import javax.servlet.ServletContextEvent; import javax.servlet.ServletContextListener; -import org.quartz.JobDetail; -import org.quartz.Scheduler; import org.quartz.SchedulerException; -import org.quartz.SchedulerFactory; -import org.quartz.Trigger; -import org.quartz.TriggerUtils; -import org.quartz.core.QuartzScheduler; -import org.quartz.impl.StdSchedulerFactory; -import org.wamblee.crawler.kiss.scheduling.quartz.CrawlerJob; -import org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler; +import org.wamblee.crawler.kiss.scheduling.CrawlerScheduler; import org.wamblee.general.BeanKernel; /** @@ -55,8 +45,9 @@ public class Application implements ServletContextListener { aEvent.getServletContext().log("KiSS Crawler initializing"); try { getScheduler().initialize(); - } catch (SchedulerException e) { + } catch (Exception e) { aEvent.getServletContext().log("Error scheduling job", e); + return; } aEvent.getServletContext().log("KiSS Crawler initialized"); } @@ -70,8 +61,9 @@ public class Application implements ServletContextListener { aEvent.getServletContext().log("KiSS Crawler shutting down"); try { getScheduler().shutdown(); - } catch (SchedulerException e) { + } catch (Exception e) { aEvent.getServletContext().log("Error scheduling job", e); + return; } aEvent.getServletContext().log("KiSS Crawler shut down complete"); } @@ -80,11 +72,11 @@ public class Application implements ServletContextListener { * Gets the scheduler from Spring. * @return Scheduler. 
*/ - private QuartzCrawlerScheduler getScheduler() { - return BeanKernel.getBeanFactory().find(QuartzCrawlerScheduler.class); + private CrawlerScheduler getScheduler() { + return BeanKernel.getBeanFactory().find(CrawlerScheduler.class); } - public static void main(String[] aArgs) throws SchedulerException { + public static void main(String[] aArgs) throws Exception { Application application = new Application(); application.getScheduler().initialize(); } -- 2.31.1