--- /dev/null
+Manifest-Version: 1.0\r
+Class-Path: \r
+\r
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<web-app version="2.4"
+ xmlns="http://java.sun.com/xml/ns/j2ee"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://java.sun.com/xml/ns/j2ee
+ http://java.sun.com/xml/ns/j2ee/web-app_2_4.xsd">
+
+ <!-- Servlet context listener that initializes the crawler scheduler when
+ the web application starts and shuts it down when the application
+ is destroyed. No servlets are declared in this descriptor. -->
+ <listener>
+ <listener-class>org.wamblee.crawler.kiss.servlet.Application</listener-class>
+ </listener>
+</web-app>
--- /dev/null
+<?xml version="1.0"?>
+
+<!DOCTYPE project [
+ <!ENTITY header SYSTEM "file:../../build/header.xml">
+ <!ENTITY trailer SYSTEM "file:../../build/trailer.xml">
+ <!ENTITY crawlerdeps SYSTEM "file:../basic/deps.xml">
+ <!ENTITY kisscrawlerdeps SYSTEM "file:../kiss/deps.xml">
+]>
+
+<project name="crawler" default="jar" basedir=".">
+
+
+ <!-- =============================================================================== -->
+ <!-- Module properties, followed by the shared build header and the dependency -->
+ <!-- definitions. The properties are set before the header is included. -->
+ <!-- =============================================================================== -->
+ <property name="project.home" value="../.."/>
+ <property name="module.name" value="wamblee-crawler-kissweb" />
+ <property name="webroot.dir" value="WebRoot"/>
+
+ <!-- External entities declared in the DOCTYPE: build/header.xml,
+ basic/deps.xml and kiss/deps.xml. -->
+ &header;
+ &crawlerdeps;
+ &kisscrawlerdeps;
+
+ <!-- Libraries and modules needed to build this module. The target has no
+ body of its own; it only pulls in the targets listed in "depends".
+ NOTE(review): the *.d targets are presumably defined in the included
+ header and deps fragments; confirm. -->
+ <target name="module.build.deps"
+ depends="kisscrawler.src.d,servletapi.d,wamblee.kisscrawler.d,quartz.d,spring.d">
+ </target>
+
+ <!-- Additional libraries needed only for the tests. A library that is
+ already part of module.build.path must not be listed here again. -->
+ <target name="module.test.deps" depends="kisscrawler.test.d,wamblee.kisscrawler.test.d">
+ </target>
+
+ <!-- External entity declared in the DOCTYPE: build/trailer.xml. -->
+ &trailer;
+
+
+</project>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN" "http://www.springframework.org/dtd/spring-beans.dtd">
+
+ <beans>
+
+ <!-- Application context registered under the id "crawler". It is built
+ from the classpath resource listed in the constructor argument.
+ NOTE(review): CrawlerBeanFactory passes the same name ("crawler") to
+ its superclass, presumably to look up this bean; confirm. -->
+ <bean id="crawler"
+ class="org.springframework.context.support.ClassPathXmlApplicationContext">
+ <constructor-arg>
+ <list>
+ <value>org.wamblee.crawler.kiss.xml</value>
+ </list>
+ </constructor-arg>
+ </bean>
+
+ </beans>
\ No newline at end of file
--- /dev/null
+
+##############################################################################
+# Class name of the bean factory used by the crawler application
+##############################################################################
+
+org.wamblee.beanfactory.class=org.wamblee.crawler.kiss.spring.CrawlerBeanFactory
+
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN" "http://www.springframework.org/dtd/spring-beans.dtd">
+
+<beans>
+
+ <!-- Wrapper around the Quartz scheduler. No init-method is configured
+ here; the servlet context listener (servlet.Application) calls
+ initialize() and shutdown() on this bean. -->
+ <bean id="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler"
+ class="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler">
+ </bean>
+
+ <!-- Executes the crawler. Constructor arguments, in order: crawler
+ configuration file, program configuration file.
+ NOTE(review): both paths look like placeholders; replace them with
+ the real locations before deploying. -->
+ <bean id="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"
+ class="org.wamblee.crawler.kiss.scheduling.CrawlerExecutorImpl">
+ <constructor-arg><value>path/to/config.xml</value></constructor-arg>
+ <constructor-arg><value>path/to/programs.xml</value></constructor-arg>
+ </bean>
+
+ <!-- Decides on every trigger whether the crawler actually runs.
+ Constructor arguments, in order: the executor, the minimum hour and
+ the maximum hour of the day (both inclusive, 24-hour clock). -->
+ <bean id="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule"
+ class="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule">
+ <constructor-arg><ref local="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"/></constructor-arg>
+ <constructor-arg><value type="int">5</value></constructor-arg> <!-- from 5 AM -->
+ <constructor-arg><value type="int">16</value></constructor-arg> <!-- to 4 PM -->
+ </bean>
+</beans>
\ No newline at end of file
--- /dev/null
+/*
+ * Copyright 2006 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.scheduling;
+
+import java.util.Date;
+
+/**
+ * Encapsulates the actual execution of the crawler.
+ * This interface makes it possible to test the scheduling logic
+ * in isolation.
+ */
+public interface CrawlerExecutor {
+ /**
+ * Runs the crawler once.
+ *
+ * @param aDate Time of the execution as supplied by the caller;
+ * {@link CrawlerSchedule} passes on the time it was invoked with.
+ * @throws Exception If the run fails. {@link CrawlerSchedule} records
+ * the thrown exception as the result of the last execution.
+ */
+ void execute(Date aDate) throws Exception;
+}
--- /dev/null
+/*
+ * Copyright 2006 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.scheduling;
+
+import java.util.Date;
+
+import org.wamblee.crawler.kiss.main.KissCrawler;
+
+/**
+ * {@link CrawlerExecutor} implementation which executes the KiSS crawler
+ * for retrieving web content.
+ */
+public class CrawlerExecutorImpl implements CrawlerExecutor {
+
+    /** Crawler configuration file passed to {@link KissCrawler}. */
+    private final String _crawlerConfig;
+
+    /** Program configuration file passed to {@link KissCrawler}. */
+    private final String _programConfig;
+
+    /**
+     * Constructs the crawler executor.
+     *
+     * @param aCrawlerConfig Crawler configuration file.
+     * @param aProgramConfig Program configuration file.
+     */
+    public CrawlerExecutorImpl(String aCrawlerConfig, String aProgramConfig) {
+        _crawlerConfig = aCrawlerConfig;
+        _programConfig = aProgramConfig;
+    }
+
+    /**
+     * Creates a new {@link KissCrawler} from the two configuration files.
+     * Nothing else is invoked on the created object.
+     * NOTE(review): the constructor is presumably what performs the crawl;
+     * confirm against KissCrawler.
+     *
+     * @param aDate Time of the execution; not used by this implementation.
+     * @throws Exception Any exception raised while constructing the crawler.
+     * @see org.wamblee.crawler.kiss.scheduling.CrawlerExecutor#execute(java.util.Date)
+     */
+    public void execute(Date aDate) throws Exception {
+        // The instance itself is not needed afterwards, so it is not kept.
+        new KissCrawler(_crawlerConfig, _programConfig);
+    }
+}
--- /dev/null
+/*
+ * Copyright 2006 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.scheduling;
+
+import java.io.Serializable;
+import java.util.Calendar;
+import java.util.Date;
+
+/**
+ * This class encapsulates the logic for deciding whether to run the
+ * crawler. This keeps the scheduler simple (e.g. firing every hour)
+ * while the more complex rules for running and retrying the crawler
+ * live here.
+ *
+ * <p>Rules applied on every call to {@link #execute(Date)}:
+ * <ul>
+ * <li>The hour of day must lie within the configured window
+ *     (both bounds inclusive).</li>
+ * <li>Inside the window the crawler runs if it has not been executed yet
+ *     on that calendar day, or if its last execution failed.</li>
+ * </ul>
+ *
+ * <p>NOTE(review): this class is {@link Serializable} but holds a
+ * {@link CrawlerExecutor}, which is not declared serializable; confirm
+ * whether instances are ever actually serialized.
+ */
+public class CrawlerSchedule implements Serializable {
+
+    /** Performs the actual crawler run. */
+    private final CrawlerExecutor _crawler;
+
+    /** Time passed to the last run, or null if there was no run yet. */
+    private Date _lastExecuted;
+
+    /** Exception thrown by the last run, or null if it succeeded (or never ran). */
+    private Exception _lastResult;
+
+    /** First hour of the day (inclusive) in which the crawler may run. */
+    private final int _hourMin;
+
+    /** Last hour of the day (inclusive) in which the crawler may run. */
+    private final int _hourMax;
+
+    /**
+     * Constructs the schedule.
+     *
+     * @param aCrawler The interface by which the crawler is executed.
+     * @param aHourMin The crawler may only run if {@code hour >= aHourMin}.
+     * @param aHourMax The crawler may only run if {@code hour <= aHourMax}.
+     */
+    public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
+        _crawler = aCrawler;
+        _lastExecuted = null;
+        _lastResult = null;
+        _hourMin = aHourMin;
+        _hourMax = aHourMax;
+    }
+
+    /**
+     * Called by a scheduled job. This determines whether the crawler must be
+     * run or not and, if so, runs it. This encapsulates the rules for
+     * retrying and scheduling the crawler. An exception thrown by the crawler
+     * is not propagated; it is stored and available through
+     * {@link #getLastResult()}.
+     *
+     * @param aDate Time at which we are executing now.
+     */
+    public void execute(Date aDate) {
+        if (mustExecute(aDate)) {
+            try {
+                _lastResult = null;
+                _crawler.execute(aDate);
+            } catch (Exception e) {
+                _lastResult = e;
+            } finally {
+                // Recorded for failed runs as well.
+                _lastExecuted = aDate;
+            }
+        }
+    }
+
+    /**
+     * Gets the time the crawler was last executed.
+     *
+     * @return Time of last execution, or null if it was never executed.
+     */
+    public Date getLastExecuted() {
+        return _lastExecuted;
+    }
+
+    /**
+     * Gets the result of the last execution.
+     *
+     * @return null if the last execution was successful (or the crawler was
+     *         never executed) or an exception otherwise.
+     */
+    public Exception getLastResult() {
+        return _lastResult;
+    }
+
+    /**
+     * Determines whether or not the crawler must be run.
+     *
+     * @param aDate Current time.
+     * @return True iff the crawler must be run.
+     */
+    private boolean mustExecute(Date aDate) {
+        Calendar now = Calendar.getInstance();
+        now.setTime(aDate);
+        int hour = now.get(Calendar.HOUR_OF_DAY);
+        if (hour < _hourMin || hour > _hourMax) {
+            return false; // outside the allowed time window.
+        }
+        if (!wasExecutedOnSameDay(now)) {
+            // First execution of today. Deliberately not tied to
+            // hour == _hourMin: if that trigger was missed, the crawler
+            // still runs at the next opportunity within the window.
+            return true;
+        }
+        // Already executed today: retry only if that execution failed.
+        return _lastResult != null;
+    }
+
+    /**
+     * Checks whether the last execution took place on the same calendar day
+     * (in the default time zone) as the given moment.
+     *
+     * @param aNow Moment to compare the last execution with.
+     * @return True iff the crawler was executed before and that execution
+     *         was on the same day as <code>aNow</code>.
+     */
+    private boolean wasExecutedOnSameDay(Calendar aNow) {
+        if (_lastExecuted == null) {
+            return false;
+        }
+        Calendar last = Calendar.getInstance();
+        last.setTime(_lastExecuted);
+        return last.get(Calendar.YEAR) == aNow.get(Calendar.YEAR)
+            && last.get(Calendar.DAY_OF_YEAR) == aNow.get(Calendar.DAY_OF_YEAR);
+    }
+}
--- /dev/null
+/*
+ * Copyright 2006 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.scheduling.quartz;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.quartz.JobExecutionContext;
+import org.quartz.JobExecutionException;
+import org.quartz.StatefulJob;
+import org.wamblee.crawler.kiss.scheduling.CrawlerSchedule;
+import org.wamblee.general.BeanKernel;
+
+/**
+ * Quartz job to execute the crawler.
+ */
+public class CrawlerJob implements StatefulJob {
+
+ private static final Log LOG = LogFactory.getLog(CrawlerJob.class);
+
+ /**
+ * Constructs the job.
+ *
+ */
+ public CrawlerJob() {
+ // Empty.
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.quartz.Job#execute(org.quartz.JobExecutionContext)
+ */
+ public void execute(JobExecutionContext aContext)
+ throws JobExecutionException {
+ LOG.info("Job triggered");
+ try {
+ CrawlerSchedule schedule = BeanKernel.getBeanFactory().find(
+ CrawlerSchedule.class);
+ schedule.execute(aContext.getFireTime());
+ } catch (Exception e) {
+ throw new JobExecutionException("Error executing crawler", e, false);
+ }
+ }
+}
--- /dev/null
+/*
+ * Copyright 2006 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.scheduling.quartz;
+
+import java.util.Date;
+
+import org.quartz.JobDetail;
+import org.quartz.Scheduler;
+import org.quartz.SchedulerException;
+import org.quartz.SchedulerFactory;
+import org.quartz.Trigger;
+import org.quartz.TriggerUtils;
+import org.quartz.impl.StdSchedulerFactory;
+
+/**
+ * Thin wrapper around a Quartz {@link Scheduler} that schedules the
+ * {@link CrawlerJob}.
+ */
+public class QuartzCrawlerScheduler {
+
+    private Scheduler _scheduler;
+
+    /**
+     * Obtains a scheduler from a {@link StdSchedulerFactory}.
+     *
+     * @throws SchedulerException If the scheduler cannot be obtained.
+     */
+    public QuartzCrawlerScheduler() throws SchedulerException {
+        SchedulerFactory factory = new StdSchedulerFactory();
+        _scheduler = factory.getScheduler();
+    }
+
+    /**
+     * Starts the scheduler and then registers the crawler job with an
+     * hourly trigger.
+     *
+     * @throws SchedulerException If starting or scheduling fails.
+     */
+    public void initialize() throws SchedulerException {
+        _scheduler.start();
+        _scheduler.scheduleJob(createCrawlerJob(), createHourlyTrigger());
+    }
+
+    /**
+     * Shuts down the scheduler.
+     *
+     * @throws SchedulerException If the shutdown fails.
+     */
+    public void shutdown() throws SchedulerException {
+        _scheduler.shutdown();
+    }
+
+    /**
+     * Builds the job detail for the crawler job.
+     *
+     * @return Job detail named "kisscrawler" with a null group and a
+     *         "count" entry of 0 in its job data map.
+     */
+    private JobDetail createCrawlerJob() {
+        JobDetail crawlerJob = new JobDetail("kisscrawler", null, CrawlerJob.class);
+        crawlerJob.getJobDataMap().put("count", 0);
+        return crawlerJob;
+    }
+
+    /**
+     * Builds the trigger for the crawler job.
+     *
+     * @return Hourly trigger named "hourly" whose start time is the even
+     *         hour date computed from the current time.
+     */
+    private Trigger createHourlyTrigger() {
+        Trigger everyHour = TriggerUtils.makeHourlyTrigger();
+        Date firstFireTime = TriggerUtils.getEvenHourDate(new Date());
+        everyHour.setStartTime(firstFireTime);
+        everyHour.setName("hourly");
+        return everyHour;
+    }
+}
--- /dev/null
+/*
+ * Copyright 2006 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.servlet;
+
+import java.util.Date;
+
+import javax.servlet.ServletContextEvent;
+import javax.servlet.ServletContextListener;
+
+import org.quartz.JobDetail;
+import org.quartz.Scheduler;
+import org.quartz.SchedulerException;
+import org.quartz.SchedulerFactory;
+import org.quartz.Trigger;
+import org.quartz.TriggerUtils;
+import org.quartz.core.QuartzScheduler;
+import org.quartz.impl.StdSchedulerFactory;
+import org.wamblee.crawler.kiss.scheduling.quartz.CrawlerJob;
+import org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler;
+import org.wamblee.general.BeanKernel;
+
+/**
+ * The mechanism for kick starting the scheduling of the KiSS crawler.
+ */
+public class Application implements ServletContextListener {
+
+ public Application() {
+ // Empty.
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see javax.servlet.ServletContextListener#contextInitialized(javax.servlet.ServletContextEvent)
+ */
+ public void contextInitialized(ServletContextEvent aEvent) {
+ aEvent.getServletContext().log("KiSS Crawler initializing");
+ try {
+ getScheduler().initialize();
+ } catch (SchedulerException e) {
+ aEvent.getServletContext().log("Error scheduling job", e);
+ }
+ aEvent.getServletContext().log("KiSS Crawler initialized");
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see javax.servlet.ServletContextListener#contextDestroyed(javax.servlet.ServletContextEvent)
+ */
+ public void contextDestroyed(ServletContextEvent aEvent) {
+ aEvent.getServletContext().log("KiSS Crawler shutting down");
+ try {
+ getScheduler().shutdown();
+ } catch (SchedulerException e) {
+ aEvent.getServletContext().log("Error scheduling job", e);
+ }
+ aEvent.getServletContext().log("KiSS Crawler shut down complete");
+ }
+
+ private QuartzCrawlerScheduler getScheduler() {
+ return BeanKernel.getBeanFactory().find(QuartzCrawlerScheduler.class);
+ }
+
+ public static void main(String[] aArgs) throws SchedulerException {
+ Application application = new Application();
+ application.getScheduler().initialize();
+ }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.wamblee.crawler.kiss.spring;
+import org.wamblee.general.SpringBeanFactory;
+
+
+/**
+ * {@link SpringBeanFactory} preconfigured for the crawler application.
+ */
+public class CrawlerBeanFactory extends SpringBeanFactory {
+
+    /** First argument handed to the superclass constructor. */
+    private static final String BEAN_REF_CONTEXT = "beanRefContext.xml";
+
+    /** Second argument handed to the superclass constructor. */
+    private static final String CRAWLER_FACTORY = "crawler";
+
+    /**
+     * Constructs the bean factory with the fixed selector and factory names
+     * of the crawler application.
+     * NOTE(review): presumably the selector is the bean reference context
+     * file and the factory name the id of the context bean defined in it;
+     * confirm against SpringBeanFactory.
+     */
+    public CrawlerBeanFactory() {
+        super(BEAN_REF_CONTEXT, CRAWLER_FACTORY);
+    }
+}