From c53b455e7bd72ad4748848fcbe0ebf079f1db884 Mon Sep 17 00:00:00 2001 From: Erik Brakkee Date: Mon, 24 Apr 2006 20:27:06 +0000 Subject: [PATCH] --- crawler/kissweb/WebRoot/META-INF/MANIFEST.MF | 3 + crawler/kissweb/WebRoot/WEB-INF/web.xml | 11 ++ crawler/kissweb/build.xml | 37 ++++++ crawler/kissweb/src/beanRefContext.xml | 15 +++ .../src/org.wamblee.beanfactory.properties | 7 ++ .../kissweb/src/org.wamblee.crawler.kiss.xml | 22 ++++ .../kiss/scheduling/CrawlerExecutor.java | 29 +++++ .../kiss/scheduling/CrawlerExecutorImpl.java | 47 ++++++++ .../kiss/scheduling/CrawlerSchedule.java | 111 ++++++++++++++++++ .../kiss/scheduling/quartz/CrawlerJob.java | 58 +++++++++ .../quartz/QuartzCrawlerScheduler.java | 69 +++++++++++ .../crawler/kiss/servlet/Application.java | 83 +++++++++++++ .../kiss/spring/CrawlerBeanFactory.java | 34 ++++++ 13 files changed, 526 insertions(+) create mode 100644 crawler/kissweb/WebRoot/META-INF/MANIFEST.MF create mode 100644 crawler/kissweb/WebRoot/WEB-INF/web.xml create mode 100644 crawler/kissweb/build.xml create mode 100644 crawler/kissweb/src/beanRefContext.xml create mode 100644 crawler/kissweb/src/org.wamblee.beanfactory.properties create mode 100644 crawler/kissweb/src/org.wamblee.crawler.kiss.xml create mode 100644 crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java create mode 100644 crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutorImpl.java create mode 100644 crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java create mode 100644 crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/CrawlerJob.java create mode 100644 crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java create mode 100644 crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java create mode 100644 crawler/kissweb/src/org/wamblee/crawler/kiss/spring/CrawlerBeanFactory.java diff --git a/crawler/kissweb/WebRoot/META-INF/MANIFEST.MF 
b/crawler/kissweb/WebRoot/META-INF/MANIFEST.MF new file mode 100644 index 00000000..5e949512 --- /dev/null +++ b/crawler/kissweb/WebRoot/META-INF/MANIFEST.MF @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +Class-Path: + diff --git a/crawler/kissweb/WebRoot/WEB-INF/web.xml b/crawler/kissweb/WebRoot/WEB-INF/web.xml new file mode 100644 index 00000000..3239a34a --- /dev/null +++ b/crawler/kissweb/WebRoot/WEB-INF/web.xml @@ -0,0 +1,11 @@ + + + + + org.wamblee.crawler.kiss.servlet.Application + + diff --git a/crawler/kissweb/build.xml b/crawler/kissweb/build.xml new file mode 100644 index 00000000..aa400c7b --- /dev/null +++ b/crawler/kissweb/build.xml @@ -0,0 +1,37 @@ + + + + + + +]> + + + + + + + + + + + + &header; + &crawlerdeps; + &kisscrawlerdeps; + + + + + + + + + &trailer; + + + diff --git a/crawler/kissweb/src/beanRefContext.xml b/crawler/kissweb/src/beanRefContext.xml new file mode 100644 index 00000000..0881652f --- /dev/null +++ b/crawler/kissweb/src/beanRefContext.xml @@ -0,0 +1,15 @@ + + + + + + + + + org.wamblee.crawler.kiss.xml + + + + + \ No newline at end of file diff --git a/crawler/kissweb/src/org.wamblee.beanfactory.properties b/crawler/kissweb/src/org.wamblee.beanfactory.properties new file mode 100644 index 00000000..563a09c9 --- /dev/null +++ b/crawler/kissweb/src/org.wamblee.beanfactory.properties @@ -0,0 +1,7 @@ + +############################################################################## +# Class name of the beanfactory used by the KiSS crawler application +############################################################################## + +org.wamblee.beanfactory.class=org.wamblee.crawler.kiss.spring.CrawlerBeanFactory + diff --git a/crawler/kissweb/src/org.wamblee.crawler.kiss.xml b/crawler/kissweb/src/org.wamblee.crawler.kiss.xml new file mode 100644 index 00000000..79bf2413 --- /dev/null +++ b/crawler/kissweb/src/org.wamblee.crawler.kiss.xml @@ -0,0 +1,22 @@ + + + + + + + + + + path/to/config.xml + path/to/programs.xml + + + + + 5 + 16 + + \ No newline 
at end of file diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java new file mode 100644 index 00000000..e07d90a6 --- /dev/null +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java @@ -0,0 +1,29 @@ +/* + * Copyright 2006 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss.scheduling; + +import java.util.Date; + +/** + * Encapsulates the actual execution of the crawler. + * This interface makes it possible to test the scheduling logic + * in isolation. + * + */ +public interface CrawlerExecutor { + void execute(Date aDate) throws Exception; +} diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutorImpl.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutorImpl.java new file mode 100644 index 00000000..35d3f923 --- /dev/null +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutorImpl.java @@ -0,0 +1,47 @@ +/* + * Copyright 2006 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss.scheduling; + +import java.util.Date; + +import org.wamblee.crawler.kiss.main.KissCrawler; + +/** + * Implementation which executes the KiSS crawler for retrieving web content. + */ +public class CrawlerExecutorImpl implements CrawlerExecutor { + + private String _crawlerConfig; + private String _programConfig; + + /** + * Constructs the crawler executor. + * @param aCrawlerConfig Crawler configuration file. + * @param aProgramConfig Program configuration file. + */ + public CrawlerExecutorImpl(String aCrawlerConfig, String aProgramConfig) { + _crawlerConfig = aCrawlerConfig; + _programConfig = aProgramConfig; + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.kiss.scheduling.CrawlerExecutor#execute(java.util.Date) + */ + public void execute(Date aDate) throws Exception { + KissCrawler crawler = new KissCrawler(_crawlerConfig, _programConfig); + } +} diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java new file mode 100644 index 00000000..09ada84d --- /dev/null +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java @@ -0,0 +1,111 @@ +/* + * Copyright 2006 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss.scheduling; + +import java.io.Serializable; +import java.util.Calendar; +import java.util.Date; + +/** + * This class encapsulates the logic for deciding whether to + * run the crawler. This provides the mechanism to keep the + * scheduler simple (e.g. scheduling every hour) while providing + * more complex logic for determining whether to run the + * crawler. + */ +public class CrawlerSchedule implements Serializable { + + private CrawlerExecutor _crawler; + private Date _lastExecuted; + private Exception _lastResult; + private int _hourMin; + private int _hourMax; + + /** + * Constructs the scheduler. + * @param aCrawler The interface by which the crawler is executed. + * @param aHourMin The crawler may only run if hour >= aHourMin + * @param aHourMax The crawler may only run if hour <= aHourMax + */ + public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) { + _crawler = aCrawler; + _lastExecuted = null; + _lastResult = null; + _hourMin = aHourMin; + _hourMax = aHourMax; + } + + /** + * Called by a scheduled job. This determines whether the crawler must be run or + * not. This encapsulates the rules for retrying and scheduling the crawler. + * @param aDate Time at which we are executing now. + */ + public void execute(Date aDate) { + if (mustExecute(aDate)) { + try { + _lastResult = null; + _crawler.execute(aDate); + } catch (Exception e) { + _lastResult = e; + } finally { + _lastExecuted = aDate; + } + } + } + + /** + * Gets the time the crawler was last executed. 
+ * @return Time of last execution. + */ + public Date getLastExecuted() { + return _lastExecuted; + } + + /** + * Gets the result of the last execution. + * @return null if the last execution was successful or an exception + * otherwise. + */ + public Exception getLastResult() { + return _lastResult; + } + + /** + * Determines whether or not the crawler must be run. + * @param aDate Current time. + * @return True iff the crawler must be run. + */ + private boolean mustExecute(Date aDate) { + Calendar calendar = Calendar.getInstance(); + calendar.setTime(aDate); + int hour = calendar.get(Calendar.HOUR_OF_DAY); + if ( hour < _hourMin ) { + return false; + } + if (hour > _hourMax ) { + return false; + } + if ( hour == _hourMin ) { + return true; // First execution of today. + } + if ( _lastResult != null ) { + return true; // last execution of today was unsuccessful, retry. + } + + return false; // already run successfully today. + } +} diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/CrawlerJob.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/CrawlerJob.java new file mode 100644 index 00000000..b86f5a92 --- /dev/null +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/CrawlerJob.java @@ -0,0 +1,58 @@ +/* + * Copyright 2006 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wamblee.crawler.kiss.scheduling.quartz; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.quartz.JobExecutionContext; +import org.quartz.JobExecutionException; +import org.quartz.StatefulJob; +import org.wamblee.crawler.kiss.scheduling.CrawlerSchedule; +import org.wamblee.general.BeanKernel; + +/** + * Quartz job to execute the crawler. + */ +public class CrawlerJob implements StatefulJob { + + private static final Log LOG = LogFactory.getLog(CrawlerJob.class); + + /** + * Constructs the job. + * + */ + public CrawlerJob() { + // Empty. + } + + /* + * (non-Javadoc) + * + * @see org.quartz.Job#execute(org.quartz.JobExecutionContext) + */ + public void execute(JobExecutionContext aContext) + throws JobExecutionException { + LOG.info("Job triggered"); + try { + CrawlerSchedule schedule = BeanKernel.getBeanFactory().find( + CrawlerSchedule.class); + schedule.execute(aContext.getFireTime()); + } catch (Exception e) { + throw new JobExecutionException("Error executing crawler", e, false); + } + } +} diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java new file mode 100644 index 00000000..9458d1c3 --- /dev/null +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java @@ -0,0 +1,69 @@ +/* + * Copyright 2006 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss.scheduling.quartz; + +import java.util.Date; + +import org.quartz.JobDetail; +import org.quartz.Scheduler; +import org.quartz.SchedulerException; +import org.quartz.SchedulerFactory; +import org.quartz.Trigger; +import org.quartz.TriggerUtils; +import org.quartz.impl.StdSchedulerFactory; + +/** + * Interface to the Quartz scheduler. + */ +public class QuartzCrawlerScheduler { + + private Scheduler _scheduler; + + /** + * Constructs the quartz interface. + * @throws SchedulerException + */ + public QuartzCrawlerScheduler() throws SchedulerException { + SchedulerFactory schedulerFactory = new StdSchedulerFactory(); + _scheduler = schedulerFactory.getScheduler(); + } + + /** + * Initializes the scheduler. + * @throws SchedulerException + */ + public void initialize() throws SchedulerException { + _scheduler.start(); + + JobDetail jobDetail = new JobDetail("kisscrawler", null, CrawlerJob.class); + jobDetail.getJobDataMap().put("count", 0); + + Trigger trigger = TriggerUtils.makeHourlyTrigger(); + trigger.setStartTime(TriggerUtils.getEvenHourDate(new Date())); + trigger.setName("hourly"); + + _scheduler.scheduleJob(jobDetail, trigger); + } + + /** + * Shuts down the scheduler. + * @throws SchedulerException + */ + public void shutdown() throws SchedulerException { + _scheduler.shutdown(); + } +} diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java new file mode 100644 index 00000000..a4e78f74 --- /dev/null +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java @@ -0,0 +1,83 @@ +/* + * Copyright 2006 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss.servlet; + +import java.util.Date; + +import javax.servlet.ServletContextEvent; +import javax.servlet.ServletContextListener; + +import org.quartz.JobDetail; +import org.quartz.Scheduler; +import org.quartz.SchedulerException; +import org.quartz.SchedulerFactory; +import org.quartz.Trigger; +import org.quartz.TriggerUtils; +import org.quartz.core.QuartzScheduler; +import org.quartz.impl.StdSchedulerFactory; +import org.wamblee.crawler.kiss.scheduling.quartz.CrawlerJob; +import org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler; +import org.wamblee.general.BeanKernel; + +/** + * The mechanism for kick starting the scheduling of the KiSS crawler. + */ +public class Application implements ServletContextListener { + + public Application() { + // Empty. 
+ } + + /* + * (non-Javadoc) + * + * @see javax.servlet.ServletContextListener#contextInitialized(javax.servlet.ServletContextEvent) + */ + public void contextInitialized(ServletContextEvent aEvent) { + aEvent.getServletContext().log("KiSS Crawler initializing"); + try { + getScheduler().initialize(); + } catch (SchedulerException e) { + aEvent.getServletContext().log("Error scheduling job", e); + } + aEvent.getServletContext().log("KiSS Crawler initialized"); + } + + /* + * (non-Javadoc) + * + * @see javax.servlet.ServletContextListener#contextDestroyed(javax.servlet.ServletContextEvent) + */ + public void contextDestroyed(ServletContextEvent aEvent) { + aEvent.getServletContext().log("KiSS Crawler shutting down"); + try { + getScheduler().shutdown(); + } catch (SchedulerException e) { + aEvent.getServletContext().log("Error scheduling job", e); + } + aEvent.getServletContext().log("KiSS Crawler shut down complete"); + } + + private QuartzCrawlerScheduler getScheduler() { + return BeanKernel.getBeanFactory().find(QuartzCrawlerScheduler.class); + } + + public static void main(String[] aArgs) throws SchedulerException { + Application application = new Application(); + application.getScheduler().initialize(); + } +} diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/spring/CrawlerBeanFactory.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/spring/CrawlerBeanFactory.java new file mode 100644 index 00000000..e101b9e7 --- /dev/null +++ b/crawler/kissweb/src/org/wamblee/crawler/kiss/spring/CrawlerBeanFactory.java @@ -0,0 +1,34 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.wamblee.crawler.kiss.spring; +import org.wamblee.general.SpringBeanFactory; + + +/** + * Bean factory for the crawler application. + */ +public class CrawlerBeanFactory extends SpringBeanFactory { + private static final String SELECTOR_NAME = "beanRefContext.xml"; + private static final String FACTORY_NAME = "crawler"; + + /** + * Constructs the bean factory. + * + */ + public CrawlerBeanFactory() { + super(SELECTOR_NAME, FACTORY_NAME); + } +} -- 2.31.1