<classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/wamblee-support.jar"/>
<classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/xerces-2.4.0.jar"/>
<classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/servletapi-2.4.jar"/>
- <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/quartz-1.5.2.jar"/>
<classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/wamblee-crawler-kiss.jar"/>
<classpathentry kind="lib" path="support/lib/external/spring-1.2.5.jar"/>
<classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/spring-1.2.5.jar"/>
+ <classpathentry kind="lib" path="crawler/kissweb/WebRoot/WEB-INF/lib/quartz-1.5.1.jar"/>
<classpathentry kind="output" path="crawler/kissweb/WebRoot/WEB-INF/classes"/>
</classpath>
* Start URL of the electronic programme guide.
*/
private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
+
+ /**
+ * Default socket timeout to use, in milliseconds.
+ */
+ private static final int SOCKET_TIMEOUT = 20000;
/**
* Regular expression for matching time interval strings in the retrieved
public static void main(String[] aArgs) throws Exception {
String crawlerConfig = new File(aArgs[0]).getCanonicalPath();
String programConfig = new File(aArgs[1]).getCanonicalPath();
- new KissCrawler(START_URL, crawlerConfig, programConfig);
+ new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig);
}
/**
*/
public KissCrawler(String aCrawlerConfig,
String aProgramConfig) throws IOException, NotificationException {
- this(START_URL, aCrawlerConfig, aProgramConfig);
+ this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig);
}
*
* @param aStartUrl
* Start URL of the electronic programme guide.
+ * @param aSocketTimeout Socket timeout to use, in milliseconds.
* @param aCrawlerConfig
* Configuration file for the crawler.
* @param aProgramConfig
* @throws MessagingException
* In case of problems sending a mail notification.
*/
- public KissCrawler(String aStartUrl, String aCrawlerConfig,
+ public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig,
String aProgramConfig) throws IOException, NotificationException {
_pattern = Pattern.compile(TIME_REGEX);
try {
HttpClient client = new HttpClient();
// client.getHostConfiguration().setProxy("127.0.0.1", 3128);
+ // Apply the timeout supplied by the caller; using the SOCKET_TIMEOUT constant here
+ // would silently ignore the aSocketTimeout constructor argument.
+ client.getParams().setParameter("http.socket.timeout", aSocketTimeout);
XslTransformer transformer = new XslTransformer(
new ClasspathUriResolver());
<beans>
+ <!-- The object that tells quartz how to schedule the crawler -->
<bean id="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler"
class="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler">
+ <constructor-arg><value type="int">3600</value></constructor-arg>
</bean>
+ <!-- The object which executes the crawler -->
+ <!-- NOTE(review): the two constructor arguments below are absolute paths under /home/erik,
+ i.e. specific to one machine. Presumably they are the crawler configuration and the
+ program configuration files; confirm, and adjust them per deployment. -->
<bean id="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"
class="org.wamblee.crawler.kiss.scheduling.CrawlerExecutorImpl">
- <constructor-arg><value>path/to/config.xml</value></constructor-arg>
- <constructor-arg><value>path/to/programs.xml</value></constructor-arg>
+ <constructor-arg><value>/home/erik/crawler/config.xml</value></constructor-arg>
+ <constructor-arg><value>/home/erik/crawler/programs.xml</value></constructor-arg>
</bean>
+ <!-- The object that determines whether to execute the crawler when it is signalled by
+ the scheduler. -->
<bean id="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule"
class="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule">
<constructor-arg><ref local="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"/></constructor-arg>
<constructor-arg><value type="int">5</value></constructor-arg> <!-- from 5 AM -->
<constructor-arg><value type="int">16</value></constructor-arg> <!-- to 4 PM -->
</bean>
+
</beans>
\ No newline at end of file
import java.util.Calendar;
import java.util.Date;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
/**
* This class encapsulates the logic for deciding whether to
* run the crawler. This provides the mechanism to keep the
*/
public class CrawlerSchedule implements Serializable {
+ private static final Log LOG = LogFactory.getLog(CrawlerSchedule.class);
+
private CrawlerExecutor _crawler;
private Date _lastExecuted;
- private Exception _lastResult;
+ private boolean _lastResult;
+ private Exception _lastException;
private int _hourMin;
private int _hourMax;
public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
_crawler = aCrawler;
_lastExecuted = null;
- _lastResult = null;
+ _lastResult = false;
+ _lastException = null;
_hourMin = aHourMin;
_hourMax = aHourMax;
}
*/
public void execute(Date aDate) {
if (mustExecute(aDate)) {
- try {
- _lastResult = null;
+ LOG.info("Executing crawler at " + aDate);
+ try {
_crawler.execute(aDate);
+ _lastResult = true;
+ _lastException = null;
} catch (Exception e) {
- _lastResult = e;
+ _lastResult = false;
+ _lastException = e;
+ // The failure is kept for getLastException(), but log it too so that an
+ // unsuccessful run is visible next to the "Executing crawler" message.
+ LOG.warn("Crawler execution failed at " + aDate, e);
} finally {
_lastExecuted = aDate;
}
- }
+ }
}
/**
/**
* Gets the result of the last execution.
+ * @return True iff last execution was a success.
+ */
+ public boolean getLastResult() {
+ return _lastResult;
+ }
+
+ /**
+ * Gets the exception thrown by the last execution.
* @return null if the last execution was successful or an exception
* otherwise.
*/
- public Exception getLastResult() {
- return _lastResult;
+ public Exception getLastException() {
+ return _lastException;
}
/**
if (hour > _hourMax ) {
return false;
}
- if ( hour == _hourMin ) {
+
+ if ( !lastExecutionWasOnSameDay(aDate)) {
return true; // First execution of today.
- }
- if ( _lastResult != null ) {
+ }
+ // last execution was on the same day.
+ if ( !_lastResult ) {
return true; // last execution of today was unsuccessful, retry.
}
-
return false; // already run successfully today.
}
+
+ /**
+ * Checks whether the previous execution took place on the same calendar day
+ * as the given date.
+ * @param aDate Date to compare the last execution time with.
+ * @return True iff the crawler was executed before and that execution was on the
+ * same day of the same year as aDate.
+ */
+ private boolean lastExecutionWasOnSameDay(Date aDate) {
+ if ( _lastExecuted == null ) {
+ return false; // never executed before.
+ }
+ if ( getDayOfYear(aDate) != getDayOfYear(_lastExecuted) ) {
+ return false;
+ }
+ // Equal day-of-year values are not sufficient: the same day number also occurs
+ // in other years, so the year must match as well.
+ Calendar current = Calendar.getInstance();
+ current.setTime(aDate);
+ Calendar last = Calendar.getInstance();
+ last.setTime(_lastExecuted);
+ return current.get(Calendar.YEAR) == last.get(Calendar.YEAR);
+ }
+
+ /**
+ * Gets the day of the year of a date, evaluated with the calendar for the
+ * default time zone and locale.
+ * @param aDate Date to examine.
+ * @return Value of the {@link Calendar#DAY_OF_YEAR} field for the given date.
+ */
+ private int getDayOfYear(Date aDate) {
+ Calendar calendar = Calendar.getInstance();
+ calendar.setTime(aDate);
+ return calendar.get(Calendar.DAY_OF_YEAR);
+ }
}
public class QuartzCrawlerScheduler {
private Scheduler _scheduler;
+
+ private int _intervalInSeconds;
/**
* Constructs the quartz interface.
+ * @param aIntervalInSeconds Scheduling interval in seconds.
* @throws SchedulerException
*/
- public QuartzCrawlerScheduler() throws SchedulerException {
+ public QuartzCrawlerScheduler(int aIntervalInSeconds) throws SchedulerException {
SchedulerFactory schedulerFactory = new StdSchedulerFactory();
_scheduler = schedulerFactory.getScheduler();
+ _intervalInSeconds = aIntervalInSeconds;
}
/**
_scheduler.start();
JobDetail jobDetail = new JobDetail("kisscrawler", null, CrawlerJob.class);
- jobDetail.getJobDataMap().put("count", 0);
-
- Trigger trigger = TriggerUtils.makeHourlyTrigger();
- trigger.setStartTime(TriggerUtils.getEvenHourDate(new Date()));
+ Trigger trigger = TriggerUtils.makeSecondlyTrigger(_intervalInSeconds);
+ // Start immediately instead of at the next even hour (TriggerUtils.getEvenHourDate), since the interval is now configurable.
+ trigger.setStartTime(new Date());
trigger.setName("hourly");
_scheduler.scheduleJob(jobDetail, trigger);
* The mechanism for kick starting the scheduling of the KiSS crawler.
*/
public class Application implements ServletContextListener {
-
+
+ /**
+ * Constructs the listener.
+ *
+ */
public Application() {
// Empty.
}
aEvent.getServletContext().log("KiSS Crawler shut down complete");
}
+ /**
+ * Gets the scheduler from Spring.
+ * @return Scheduler.
+ */
private QuartzCrawlerScheduler getScheduler() {
return BeanKernel.getBeanFactory().find(QuartzCrawlerScheduler.class);
}