From d588ded1f8e6e1e595301d51789b4704328c4cf0 Mon Sep 17 00:00:00 2001 From: erik Date: Mon, 24 Apr 2006 23:11:55 +0000 Subject: [PATCH] --- trunk/.classpath | 2 +- .../crawler/kiss/main/KissCrawler.java | 13 +++- .../kissweb/src/org.wamblee.crawler.kiss.xml | 10 ++- .../kiss/scheduling/CrawlerSchedule.java | 61 +++++++++++++++---- .../quartz/QuartzCrawlerScheduler.java | 13 ++-- .../crawler/kiss/servlet/Application.java | 10 ++- 6 files changed, 85 insertions(+), 24 deletions(-) diff --git a/trunk/.classpath b/trunk/.classpath index 8c50115a..67c79631 100644 --- a/trunk/.classpath +++ b/trunk/.classpath @@ -57,9 +57,9 @@ - + diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java index 976d7d15..a9a8097a 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java @@ -61,6 +61,11 @@ public class KissCrawler { * Start URL of the electronic programme guide. */ private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; + + /** + * Default socket timeout to use. 
+ */ + private static final int SOCKET_TIMEOUT = 20000; /** * Regular expression for matching time interval strings in the retrieved @@ -84,7 +89,7 @@ public class KissCrawler { public static void main(String[] aArgs) throws Exception { String crawlerConfig = new File(aArgs[0]).getCanonicalPath(); String programConfig = new File(aArgs[1]).getCanonicalPath(); - new KissCrawler(START_URL, crawlerConfig, programConfig); + new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig); } /** @@ -103,7 +108,7 @@ public class KissCrawler { */ public KissCrawler(String aCrawlerConfig, String aProgramConfig) throws IOException, NotificationException { - this(START_URL, aCrawlerConfig, aProgramConfig); + this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig); } @@ -114,6 +119,7 @@ public class KissCrawler { * * @param aStartUrl * Start URL of the electronic programme guide. + * @param aSocketTimeout Socket timeout to use. * @param aCrawlerConfig * Configuration file for the crawler. * @param aProgramConfig @@ -123,7 +129,7 @@ public class KissCrawler { * @throws MessagingException * In case of problems sending a mail notification. 
*/ - public KissCrawler(String aStartUrl, String aCrawlerConfig, + public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig, String aProgramConfig) throws IOException, NotificationException { _pattern = Pattern.compile(TIME_REGEX); @@ -131,6 +137,7 @@ public class KissCrawler { try { HttpClient client = new HttpClient(); // client.getHostConfiguration().setProxy("127.0.0.1", 3128); + client.getParams().setParameter("http.socket.timeout", aSocketTimeout); XslTransformer transformer = new XslTransformer( new ClasspathUriResolver()); diff --git a/trunk/crawler/kissweb/src/org.wamblee.crawler.kiss.xml b/trunk/crawler/kissweb/src/org.wamblee.crawler.kiss.xml index 79bf2413..0137d940 100644 --- a/trunk/crawler/kissweb/src/org.wamblee.crawler.kiss.xml +++ b/trunk/crawler/kissweb/src/org.wamblee.crawler.kiss.xml @@ -3,20 +3,26 @@ + + 3600 + - path/to/config.xml - path/to/programs.xml + /home/erik/crawler/config.xml + /home/erik/crawler/programs.xml + 5 16 + \ No newline at end of file diff --git a/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java b/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java index 09ada84d..e341169a 100644 --- a/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java +++ b/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java @@ -20,6 +20,9 @@ import java.io.Serializable; import java.util.Calendar; import java.util.Date; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + /** * This class encapsulates the logic for deciding whether to * run the crawler. 
This provides the mechanism to keep the @@ -29,9 +32,12 @@ import java.util.Date; */ public class CrawlerSchedule implements Serializable { + private static final Log LOG = LogFactory.getLog(CrawlerSchedule.class); + private CrawlerExecutor _crawler; private Date _lastExecuted; - private Exception _lastResult; + private boolean _lastResult; + private Exception _lastException; private int _hourMin; private int _hourMax; @@ -44,7 +50,8 @@ public class CrawlerSchedule implements Serializable { public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) { _crawler = aCrawler; _lastExecuted = null; - _lastResult = null; + _lastResult = false; + _lastException = null; _hourMin = aHourMin; _hourMax = aHourMax; } @@ -56,15 +63,18 @@ public class CrawlerSchedule implements Serializable { */ public void execute(Date aDate) { if (mustExecute(aDate)) { - try { - _lastResult = null; + LOG.info("Executing crawler at " + aDate); + try { _crawler.execute(aDate); + _lastResult = true; + _lastException = null; } catch (Exception e) { - _lastResult = e; + _lastResult = false; + _lastException = e; } finally { _lastExecuted = aDate; } - } + } } /** @@ -77,11 +87,19 @@ public class CrawlerSchedule implements Serializable { /** * Gets the result of the last execution. + * @return True iff last execution was a success. + */ + public boolean getLastResult() { + return _lastResult; + } + + /** + * Gets the exception thrown by the last execution. * @return null if the last execution was successful or an exception * otherwise. */ - public Exception getLastResult() { - return _lastResult; + public Exception getLastException() { + return _lastException; } /** @@ -99,13 +117,32 @@ public class CrawlerSchedule implements Serializable { if (hour > _hourMax ) { return false; } - if ( hour == _hourMin ) { + + if ( !lastExecutionWasOnSameDay(aDate)) { return true; // First execution of today. - } - if ( _lastResult != null ) { + } + // last execution was on the same day. 
+ if ( !_lastResult ) { return true; // last execution of today was unsuccessful, retry. } - return false; // already run successfully today. } + + private boolean lastExecutionWasOnSameDay(Date aDate) { + if ( _lastExecuted == null ) { + return false; + } + int curDay = getDayOfYear(aDate); + int lastDay = getDayOfYear(_lastExecuted); + return curDay == lastDay; + } + + /** + * @param aDate + */ + private int getDayOfYear(Date aDate) { + Calendar calendar = Calendar.getInstance(); + calendar.setTime(aDate); + return calendar.get(Calendar.DAY_OF_YEAR); + } } diff --git a/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java b/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java index 9458d1c3..94201f37 100644 --- a/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java +++ b/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java @@ -32,14 +32,18 @@ import org.quartz.impl.StdSchedulerFactory; public class QuartzCrawlerScheduler { private Scheduler _scheduler; + + private int _intervalInSeconds; /** * Constructs the quartz interface. + * @param aIntervalInSeconds Scheduling interval in seconds. 
* @throws SchedulerException */ - public QuartzCrawlerScheduler() throws SchedulerException { + public QuartzCrawlerScheduler(int aIntervalInSeconds) throws SchedulerException { SchedulerFactory schedulerFactory = new StdSchedulerFactory(); _scheduler = schedulerFactory.getScheduler(); + _intervalInSeconds = aIntervalInSeconds; } /** @@ -50,10 +54,9 @@ public class QuartzCrawlerScheduler { _scheduler.start(); JobDetail jobDetail = new JobDetail("kisscrawler", null, CrawlerJob.class); - jobDetail.getJobDataMap().put("count", 0); - - Trigger trigger = TriggerUtils.makeHourlyTrigger(); - trigger.setStartTime(TriggerUtils.getEvenHourDate(new Date())); + Trigger trigger = TriggerUtils.makeSecondlyTrigger(_intervalInSeconds); + //trigger.setStartTime(TriggerUtils.getEvenHourDate(new Date())); + trigger.setStartTime(new Date()); trigger.setName("hourly"); _scheduler.scheduleJob(jobDetail, trigger); diff --git a/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java b/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java index a4e78f74..d8be2bf7 100644 --- a/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java +++ b/trunk/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java @@ -37,7 +37,11 @@ import org.wamblee.general.BeanKernel; * The mechanism for kick starting the scheduling of the KiSS crawler. */ public class Application implements ServletContextListener { - + + /** + * Constructs the listener. + * + */ public Application() { // Empty. } @@ -72,6 +76,10 @@ public class Application implements ServletContextListener { aEvent.getServletContext().log("KiSS Crawler shut down complete"); } + /** + * Gets the scheduler from Spring. + * @return Scheduler. + */ private QuartzCrawlerScheduler getScheduler() { return BeanKernel.getBeanFactory().find(QuartzCrawlerScheduler.class); } -- 2.31.1