(no commit message)
author erik <erik@77661180-640e-0410-b3a8-9f9b13e6d0e0>
Mon, 24 Apr 2006 20:27:06 +0000 (20:27 +0000)
committer erik <erik@77661180-640e-0410-b3a8-9f9b13e6d0e0>
Mon, 24 Apr 2006 20:27:06 +0000 (20:27 +0000)
13 files changed:
crawler/kissweb/WebRoot/META-INF/MANIFEST.MF [new file with mode: 0644]
crawler/kissweb/WebRoot/WEB-INF/web.xml [new file with mode: 0644]
crawler/kissweb/build.xml [new file with mode: 0644]
crawler/kissweb/src/beanRefContext.xml [new file with mode: 0644]
crawler/kissweb/src/org.wamblee.beanfactory.properties [new file with mode: 0644]
crawler/kissweb/src/org.wamblee.crawler.kiss.xml [new file with mode: 0644]
crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java [new file with mode: 0644]
crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutorImpl.java [new file with mode: 0644]
crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java [new file with mode: 0644]
crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/CrawlerJob.java [new file with mode: 0644]
crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java [new file with mode: 0644]
crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java [new file with mode: 0644]
crawler/kissweb/src/org/wamblee/crawler/kiss/spring/CrawlerBeanFactory.java [new file with mode: 0644]

diff --git a/crawler/kissweb/WebRoot/META-INF/MANIFEST.MF b/crawler/kissweb/WebRoot/META-INF/MANIFEST.MF
new file mode 100644 (file)
index 0000000..5e94951
--- /dev/null
@@ -0,0 +1,3 @@
+Manifest-Version: 1.0\r
+Class-Path: \r
+\r
diff --git a/crawler/kissweb/WebRoot/WEB-INF/web.xml b/crawler/kissweb/WebRoot/WEB-INF/web.xml
new file mode 100644 (file)
index 0000000..3239a34
--- /dev/null
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<web-app version="2.4" 
+       xmlns="http://java.sun.com/xml/ns/j2ee" 
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+       xsi:schemaLocation="http://java.sun.com/xml/ns/j2ee 
+       http://java.sun.com/xml/ns/j2ee/web-app_2_4.xsd">
+
+    <listener>
+        <listener-class>org.wamblee.crawler.kiss.servlet.Application</listener-class>
+    </listener>
+</web-app>
diff --git a/crawler/kissweb/build.xml b/crawler/kissweb/build.xml
new file mode 100644 (file)
index 0000000..aa400c7
--- /dev/null
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+
+<!DOCTYPE project [
+    <!ENTITY header SYSTEM "file:../../build/header.xml">
+    <!ENTITY trailer SYSTEM "file:../../build/trailer.xml">
+       <!ENTITY crawlerdeps SYSTEM "file:../basic/deps.xml">
+       <!ENTITY kisscrawlerdeps SYSTEM "file:../kiss/deps.xml">
+]>
+
+<project name="crawler" default="jar" basedir=".">
+
+
+       <!-- =============================================================================== -->
+       <!-- Include the build header defining general properties                            -->
+       <!-- =============================================================================== -->
+    <property name="project.home" value="../.."/>
+    <property name="module.name" value="wamblee-crawler-kissweb" />
+       <property name="webroot.dir" value="WebRoot"/>
+
+   &header;
+   &crawlerdeps;
+   &kisscrawlerdeps;
+       
+       <target name="module.build.deps" 
+         depends="kisscrawler.src.d,servletapi.d,wamblee.kisscrawler.d,quartz.d,spring.d">
+       </target>
+       
+       <!-- Set libraries to use in addition for test, a library which 
+                            is already mentioned in module.build.path should not be 
+                            mentioned below again --> 
+       <target name="module.test.deps" depends="kisscrawler.test.d,wamblee.kisscrawler.test.d">
+       </target>
+                       
+  &trailer; 
+  
+  
+</project>
diff --git a/crawler/kissweb/src/beanRefContext.xml b/crawler/kissweb/src/beanRefContext.xml
new file mode 100644 (file)
index 0000000..0881652
--- /dev/null
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN" "http://www.springframework.org/dtd/spring-beans.dtd">
+ <beans> 
+     
+     <bean id="crawler"
+         class="org.springframework.context.support.ClassPathXmlApplicationContext">
+         <constructor-arg>
+             <list>
+                 <value>org.wamblee.crawler.kiss.xml</value>
+             </list>
+         </constructor-arg>
+     </bean>
+     
+ </beans>
\ No newline at end of file
diff --git a/crawler/kissweb/src/org.wamblee.beanfactory.properties b/crawler/kissweb/src/org.wamblee.beanfactory.properties
new file mode 100644 (file)
index 0000000..563a09c
--- /dev/null
@@ -0,0 +1,7 @@
+
+##############################################################################
+# Class name of the beanfactory used by the crawler application
+##############################################################################
+
+org.wamblee.beanfactory.class=org.wamblee.crawler.kiss.spring.CrawlerBeanFactory
+
diff --git a/crawler/kissweb/src/org.wamblee.crawler.kiss.xml b/crawler/kissweb/src/org.wamblee.crawler.kiss.xml
new file mode 100644 (file)
index 0000000..79bf241
--- /dev/null
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN" "http://www.springframework.org/dtd/spring-beans.dtd">
+
+<beans>
+  <bean id="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler"
+      class="org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler">
+  </bean>   
+  
+  <bean id="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"
+      class="org.wamblee.crawler.kiss.scheduling.CrawlerExecutorImpl">
+      <constructor-arg><value>path/to/config.xml</value></constructor-arg>
+      <constructor-arg><value>path/to/programs.xml</value></constructor-arg>
+  </bean>
+  
+  <bean id="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule"
+      class="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule">
+      <constructor-arg><ref local="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"/></constructor-arg>
+      <constructor-arg><value type="int">5</value></constructor-arg> <!-- from 5 AM --> 
+      <constructor-arg><value type="int">16</value></constructor-arg> <!-- to 4 PM --> 
+  </bean>
+</beans> 
\ No newline at end of file
diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutor.java
new file mode 100644 (file)
index 0000000..e07d90a
--- /dev/null
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2006 the original author or authors.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ 
+
+package org.wamblee.crawler.kiss.scheduling;
+
+import java.util.Date;
+
+/**
+ * Encapsulates the actual execution of the crawler.
+ * This interface makes it possible to test the scheduling logic
+ * in isolation. The date passed to <code>execute</code> is the time 
+ * at which the run is taking place, as supplied by the caller.
+ */
+public interface CrawlerExecutor { 
+    void execute(Date aDate) throws Exception;
+}
diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutorImpl.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerExecutorImpl.java
new file mode 100644 (file)
index 0000000..35d3f92
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2006 the original author or authors.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ 
+
+package org.wamblee.crawler.kiss.scheduling;
+
+import java.util.Date;
+
+import org.wamblee.crawler.kiss.main.KissCrawler;
+
+/**
+ * Implementation which executes the KiSS crawler for retrieving web content. 
+ */
+public class CrawlerExecutorImpl implements CrawlerExecutor {
+    
+    private String _crawlerConfig; // Crawler configuration file handed to KissCrawler.
+    private String _programConfig; // Program configuration file handed to KissCrawler.
+    
+    /**
+     * Constructs the crawler executor. Only stores the two configuration file names. 
+     * @param aCrawlerConfig Crawler configuration file. 
+     * @param aProgramConfig Program configuration file. 
+     */
+    public CrawlerExecutorImpl(String aCrawlerConfig, String aProgramConfig) { 
+        _crawlerConfig = aCrawlerConfig; 
+        _programConfig = aProgramConfig; 
+    }
+
+    /* (non-Javadoc)
+     * @see org.wamblee.crawler.kiss.scheduling.CrawlerExecutor#execute(java.util.Date)
+     */
+    public void execute(Date aDate) throws Exception {
+        KissCrawler crawler = new KissCrawler(_crawlerConfig, _programConfig); // NOTE(review): the instance is never used afterwards and aDate is ignored; presumably the constructor itself performs the crawl -- confirm.
+    }
+}
diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/CrawlerSchedule.java
new file mode 100644 (file)
index 0000000..09ada84
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2006 the original author or authors.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ 
+
+package org.wamblee.crawler.kiss.scheduling;
+
+import java.io.Serializable;
+import java.util.Calendar;
+import java.util.Date;
+
+/**
+ * This class encapsulates the logic for deciding whether to
+ * run the crawler. This keeps the scheduler itself simple 
+ * (e.g. triggering every hour) while the more complex rules 
+ * for determining whether to actually run the crawler, including 
+ * retrying after a failed run, are kept in this class. 
+ */
+public class CrawlerSchedule implements Serializable {
+    
+    private CrawlerExecutor _crawler; // Performs the actual crawl. NOTE(review): CrawlerExecutor is not declared Serializable although this class is -- confirm intent.
+    private Date _lastExecuted; // Time passed to the most recent run, null if there was none yet.
+    private Exception _lastResult; // Exception thrown by the most recent run, null if it succeeded or there was none yet.
+    private int _hourMin; // First hour of the day (inclusive) in which the crawler may run.
+    private int _hourMax; // Last hour of the day (inclusive) in which the crawler may run.
+    
+    /**
+     * Constructs the schedule. Initially there is no last execution time and no last result.
+     * @param aCrawler The interface by which the crawler is executed.
+     * @param aHourMin The crawler may only run if hour &gt;= <code>aHourMin</code>
+     * @param aHourMax The crawler may only run if hour &lt;= <code>aHourMax</code>
+     */
+    public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) { 
+        _crawler = aCrawler; 
+        _lastExecuted = null; 
+        _lastResult = null; 
+        _hourMin = aHourMin;
+        _hourMax = aHourMax;
+    }
+    
+    /**
+     * Called by a scheduled job. Runs the crawler only when the rules for scheduling and 
+     * retrying allow it; an exception thrown by the crawler is stored, not propagated. 
+     * @param aDate Time at which we are executing now. 
+     */
+    public void execute(Date aDate) { 
+        if (mustExecute(aDate)) { 
+            try { 
+                _lastResult = null; // Cleared up front so that a successful run leaves no error behind.
+                _crawler.execute(aDate);
+            } catch (Exception e) {
+                _lastResult = e; // Remembered so that a later invocation retries.
+            } finally { 
+                _lastExecuted = aDate; // Recorded for successful and failed runs alike.
+            }
+        }
+    }
+    
+    /**
+     * Gets the time the crawler was last executed.
+     * @return Time of last execution, or null if the crawler was not executed yet. 
+     */
+    public Date getLastExecuted() { 
+        return _lastExecuted; 
+    }
+    
+    /**
+     * Gets the result of the last execution. 
+     * @return null if the last execution was successful (or there was none yet) 
+     *   or the exception thrown by the last execution otherwise.
+     */
+    public Exception getLastResult() { 
+        return _lastResult; 
+    }
+    
+    /**
+     * Determines whether or not the crawler must be run, based on the hour of day of the given time. 
+     * @param aDate Current time. 
+     * @return True iff the hour lies within [hourMin, hourMax] and it either equals hourMin or the last run failed. 
+     */
+    private boolean mustExecute(Date aDate) {
+        Calendar calendar = Calendar.getInstance(); // Uses the default time zone and locale.
+        calendar.setTime(aDate);
+        int hour = calendar.get(Calendar.HOUR_OF_DAY);
+        if ( hour < _hourMin ) {
+            return false; // Too early in the day.
+        }
+        if (hour > _hourMax ) { 
+            return false; // Too late in the day.
+        }
+        if ( hour == _hourMin ) { 
+            return true; // First execution of today. 
+        }
+        if ( _lastResult != null ) { 
+            return true; // last execution of today was unsuccessful, retry. 
+        }
+        
+        return false; // Within the allowed hours, past the first hour, and no failure recorded: nothing to do.
+    }
+}
diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/CrawlerJob.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/CrawlerJob.java
new file mode 100644 (file)
index 0000000..b86f5a9
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2006 the original author or authors.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.scheduling.quartz;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.quartz.JobExecutionContext;
+import org.quartz.JobExecutionException;
+import org.quartz.StatefulJob;
+import org.wamblee.crawler.kiss.scheduling.CrawlerSchedule;
+import org.wamblee.general.BeanKernel;
+
+/**
+ * Quartz job to execute the crawler: every time it fires it hands the fire time to the CrawlerSchedule bean.  
+ */
+public class CrawlerJob implements StatefulJob {
+
+    private static final Log LOG = LogFactory.getLog(CrawlerJob.class);
+
+    /**
+     * Constructs the job.
+     * The constructor intentionally does nothing.
+     */
+    public CrawlerJob() {
+        // Empty.
+    }
+
+    /*
+     * (non-Javadoc)
+     * Looks up the CrawlerSchedule bean and lets it decide whether the crawler runs for this fire time.
+     * @see org.quartz.Job#execute(org.quartz.JobExecutionContext)
+     */
+    public void execute(JobExecutionContext aContext)
+            throws JobExecutionException {
+        LOG.info("Job triggered");
+        try {
+            CrawlerSchedule schedule = BeanKernel.getBeanFactory().find(
+                    CrawlerSchedule.class);
+            schedule.execute(aContext.getFireTime());
+        } catch (Exception e) {
+            throw new JobExecutionException("Error executing crawler", e, false); // NOTE(review): the last argument is presumably Quartz' "refire immediately" flag -- confirm.
+        }
+    }
+}
diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/scheduling/quartz/QuartzCrawlerScheduler.java
new file mode 100644 (file)
index 0000000..9458d1c
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2006 the original author or authors.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ 
+
+package org.wamblee.crawler.kiss.scheduling.quartz;
+
+import java.util.Date;
+
+import org.quartz.JobDetail;
+import org.quartz.Scheduler;
+import org.quartz.SchedulerException;
+import org.quartz.SchedulerFactory;
+import org.quartz.Trigger;
+import org.quartz.TriggerUtils;
+import org.quartz.impl.StdSchedulerFactory;
+
+/**
+ * Interface to the Quartz scheduler: owns the Quartz scheduler instance and registers the hourly crawler job.
+ */
+public class QuartzCrawlerScheduler {
+    
+    private Scheduler _scheduler; 
+
+    /**
+     * Constructs the quartz interface by obtaining a scheduler from a StdSchedulerFactory. 
+     * @throws SchedulerException
+     */
+    public QuartzCrawlerScheduler() throws SchedulerException { 
+        SchedulerFactory schedulerFactory = new StdSchedulerFactory();
+        _scheduler = schedulerFactory.getScheduler();
+    }
+    
+    /**
+     * Initializes the scheduler: starts it and schedules the crawler job with an hourly trigger.
+     * @throws SchedulerException
+     */
+    public void initialize() throws SchedulerException { 
+        _scheduler.start();
+
+        JobDetail jobDetail = new JobDetail("kisscrawler", null, CrawlerJob.class);
+        jobDetail.getJobDataMap().put("count", 0); // NOTE(review): "count" is not read by CrawlerJob -- confirm it is still needed.
+
+        Trigger trigger = TriggerUtils.makeHourlyTrigger(); 
+        trigger.setStartTime(TriggerUtils.getEvenHourDate(new Date())); // Start time is an even-hour boundary derived from the current time.
+        trigger.setName("hourly");
+
+        _scheduler.scheduleJob(jobDetail, trigger);
+    }
+    
+    /**
+     * Shuts down the scheduler by delegating to the underlying Quartz scheduler. 
+     * @throws SchedulerException
+     */
+    public void shutdown() throws SchedulerException { 
+        _scheduler.shutdown();
+    }
+}
diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/servlet/Application.java
new file mode 100644 (file)
index 0000000..a4e78f7
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2006 the original author or authors.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.servlet;
+
+import java.util.Date;
+
+import javax.servlet.ServletContextEvent;
+import javax.servlet.ServletContextListener;
+
+import org.quartz.JobDetail;
+import org.quartz.Scheduler;
+import org.quartz.SchedulerException;
+import org.quartz.SchedulerFactory;
+import org.quartz.Trigger;
+import org.quartz.TriggerUtils;
+import org.quartz.core.QuartzScheduler;
+import org.quartz.impl.StdSchedulerFactory;
+import org.wamblee.crawler.kiss.scheduling.quartz.CrawlerJob;
+import org.wamblee.crawler.kiss.scheduling.quartz.QuartzCrawlerScheduler;
+import org.wamblee.general.BeanKernel;
+
+/**
+ * The mechanism for kick starting the scheduling of the KiSS crawler. 
+ */
+public class Application implements ServletContextListener {
+    
+    public Application() { 
+        // Empty. 
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see javax.servlet.ServletContextListener#contextInitialized(javax.servlet.ServletContextEvent)
+     */
+    public void contextInitialized(ServletContextEvent aEvent) {
+        aEvent.getServletContext().log("KiSS Crawler initializing");
+        try {
+            getScheduler().initialize();
+        } catch (SchedulerException e) {
+            aEvent.getServletContext().log("Error scheduling job", e);
+        }
+        aEvent.getServletContext().log("KiSS Crawler initialized");
+    }
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see javax.servlet.ServletContextListener#contextDestroyed(javax.servlet.ServletContextEvent)
+     */
+    public void contextDestroyed(ServletContextEvent aEvent) {
+        aEvent.getServletContext().log("KiSS Crawler shutting down");
+        try {
+            getScheduler().shutdown();
+        } catch (SchedulerException e) {
+            aEvent.getServletContext().log("Error scheduling job", e);
+        }
+        aEvent.getServletContext().log("KiSS Crawler shut down complete");
+    }
+
+    private QuartzCrawlerScheduler getScheduler() { 
+        return BeanKernel.getBeanFactory().find(QuartzCrawlerScheduler.class);
+    }
+
+    public static void main(String[] aArgs) throws SchedulerException {
+        Application application = new Application();
+        application.getScheduler().initialize();
+    }
+}
diff --git a/crawler/kissweb/src/org/wamblee/crawler/kiss/spring/CrawlerBeanFactory.java b/crawler/kissweb/src/org/wamblee/crawler/kiss/spring/CrawlerBeanFactory.java
new file mode 100644 (file)
index 0000000..e101b9e
--- /dev/null
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2005 the original author or authors.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ 
+package org.wamblee.crawler.kiss.spring;
+import org.wamblee.general.SpringBeanFactory;
+
+
+/**
+ * Bean factory for the crawler application. 
+ */
+public class CrawlerBeanFactory extends SpringBeanFactory {
+    private static final String SELECTOR_NAME = "beanRefContext.xml"; // Name of the bean reference context file.
+    private static final String FACTORY_NAME = "crawler"; // Same value as the id of the "crawler" bean in beanRefContext.xml.
+    
+    /**
+     * Constructs the bean factory. 
+     * Passes the selector name and the factory name on to the SpringBeanFactory superclass.
+     */
+    public CrawlerBeanFactory() {
+        super(SELECTOR_NAME, FACTORY_NAME);
+    }
+}