<fileset dir="conf/kiss">
<include name="config.xml.example"/>
<include name="programs.xml"/>
+ <include name="notification.xml"/>
</fileset>
</copy>
<program>
<priority>9</priority>
- <match>dr.*who</match>
+ <match>((dr)|(doct.*)).*who</match>
</program>
<program>
#!/bin/ksh
-cd $( dirname $0 )
+cd $( dirname $0 )/../conf
CP=""
for i in ../lib/*.jar
set -x
java -classpath $CP org.wamblee.crawler.kiss.main.KissCrawler \
- ../conf/config.xml ../conf/programs.xml
+ config.xml programs.xml
<!-- dependencies of the kiss crawler itself -->
<target name="kisscrawler.src.d"
- depends="logging.d,mail.d,commons-email.d,commons-beanutils.d,commons-codec.d,dom4j.d,xerces.d,httpclient.d,jtidy.d,wamblee.support.d,wamblee.crawler.d">
+ depends="logging.d,mail.d,commons-email.d,commons-beanutils.d,commons-codec.d,dom4j.d,xerces.d,httpclient.d,jtidy.d,wamblee.support.d,wamblee.crawler.d,spring.d">
</target>
<target name="kisscrawler.test.d" depends="wamblee.support.test.d,wamblee.crawler.test.d">
import org.wamblee.crawler.kiss.guide.TimeInterval;
import org.wamblee.crawler.kiss.notification.NotificationException;
import org.wamblee.crawler.kiss.notification.Notifier;
+import org.wamblee.general.BeanFactory;
import org.wamblee.xml.ClasspathUriResolver;
import org.wamblee.xml.XslTransformer;
public static void main(String[] aArgs) throws Exception {
String crawlerConfig = new File(aArgs[0]).getCanonicalPath();
String programConfig = new File(aArgs[1]).getCanonicalPath();
- new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig);
+
+ BeanFactory factory = new StandaloneCrawlerBeanFactory();
+ Notifier notifier = factory.find(Notifier.class);
+ new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig, notifier, new Report());
}
/**
* Configuration file for the crawler.
* @param aProgramConfig
* Configuration file describing interesting shows.
+ * @param aNotifier Object used to send notifications of the results.
+ * @param aReport Report to use.
* @throws IOException
* In case of problems reading files.
- * @throws MessagingException
- * In case of problems sending a mail notification.
+ * @throws NotificationException In case notification fails.
+ * @throws PageException In case of problems retrieving the TV guide.
*/
public KissCrawler(String aCrawlerConfig,
- String aProgramConfig) throws IOException, NotificationException, PageException {
- this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig);
+ String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException {
+ this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig, aNotifier, aReport);
}
* Configuration file for the crawler.
* @param aProgramConfig
* Configuration file describing interesting shows.
+ * @param aNotifier Object used to send notifications of the results.
+ * @param aReport Report to use.
* @throws IOException
* In case of problems reading files.
- * @throws MessagingException
- * In case of problems sending a mail notification.
+ * @throws NotificationException In case notification fails.
+ * @throws PageException In case of problems retrieving the TV guide.
*/
public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig,
- String aProgramConfig) throws IOException, NotificationException, PageException {
+ String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException {
_pattern = Pattern.compile(TIME_REGEX);
parser.parse(programConfigFile);
List<ProgramFilter> programFilters = parser.getFilters();
- Report report = new Report();
-
try {
- Page page = getStartPage(aStartUrl, crawler, report);
- TVGuide guide = createGuide(page, report);
+ Page page = getStartPage(aStartUrl, crawler, aReport);
+ TVGuide guide = createGuide(page, aReport);
PrintVisitor printer = new PrintVisitor(System.out);
guide.accept(printer);
- processResults(programFilters, guide, parser.getNotifier(),
- report);
+ processResults(programFilters, guide, aNotifier,
+ aReport);
} catch (PageException e) {
- report.addMessage("Problem getting TV guide", e);
+ aReport.addMessage("Problem getting TV guide", e);
LOG.info("Problem getting TV guide", e);
throw e;
}
- parser.getNotifier().send(report.asXml());
+ aNotifier.send(aReport.asXml());
} finally {
System.out.println("Crawler finished");
}
/**
* Constructs the program action executor.
- *
+ *
+ * @param aReport Report to use.
*/
public ProgramActionExecutor(Report aReport) {
_showsToRecord = new TreeMap<Integer, Set<Program>>();
import org.wamblee.conditions.Condition;
import org.wamblee.conditions.PropertyRegexCondition;
import org.wamblee.crawler.kiss.guide.Program;
-import org.wamblee.crawler.kiss.notification.MailNotifier;
-import org.wamblee.crawler.kiss.notification.MailServer;
-import org.wamblee.crawler.kiss.notification.Notifier;
import org.wamblee.xml.XslTransformer;
/**
private static final int DEFAULT_PRIORITY = 1;
- private static final String ELEM_PASSWORD = "password";
-
- private static final String ELEM_USERNAME = "username";
-
- private static final String ELEM_PORT = "port";
-
- private static final String ELEM_HOST = "host";
-
// Formatting configuration.
private static final String ELEM_FORMAT = "format";
private static final String ELEM_HTML = "html";
- // Mail server configuration.
-
- private static final String ELEM_NOTIFICATION = "notification";
-
- private static final String ELEM_SMTP = "smtp";
-
- private static final String ELEM_SUBJECT = "subject";
-
- private static final String ELEM_TO = "to";
-
- private static final String ELEM_FROM = "from";
-
// Configuration of interesting programs.
private static final String ELEM_PROGRAM = "program";
private XslTransformer _transformer;
- private Notifier _notifier;
-
ProgramConfigurationParser(XslTransformer aTransformer) {
_filters = null;
- _notifier = null;
_transformer = aTransformer;
}
filters.add(new ProgramFilter(condition, action));
}
_filters = filters;
-
- Element notifier = root.element(ELEM_NOTIFICATION);
- _notifier = parseNotifier(notifier);
-
} catch (DocumentException e) {
throw new RuntimeException("Error parsing program configuraiton", e);
}
}
- /**
- * Parses the notifier
- *
- * @return Notifier
- */
- private Notifier parseNotifier(Element aNotifier) {
- String from = aNotifier.elementTextTrim(ELEM_FROM);
- String to = aNotifier.elementTextTrim(ELEM_TO);
- String subject = aNotifier.elementTextTrim(ELEM_SUBJECT);
-
- Element smtp = aNotifier.element(ELEM_SMTP);
- MailServer server = parseMailServer(smtp);
-
- Element format = aNotifier.element(ELEM_FORMAT);
- String htmlXslt = format.elementTextTrim(ELEM_HTML);
- String textXslt = format.elementTextTrim(ELEM_TEXT);
-
- return new MailNotifier(from, to, subject, htmlXslt, textXslt, server, _transformer);
- }
-
- /**
- * Parses the mail server from the XML.
- *
- * @param aSmtp
- * Mail server configuration.
- * @return Mail server.
- */
- private MailServer parseMailServer(Element aSmtp) {
- String host = aSmtp.elementTextTrim(ELEM_HOST);
- Element portElem = aSmtp.element(ELEM_PORT);
- int port = DEFAULT_SMTP_PORT;
- if (portElem != null) {
- port = Integer.valueOf(portElem.getTextTrim());
- }
- String username = aSmtp.elementTextTrim(ELEM_USERNAME);
- String password = aSmtp.elementTextTrim(ELEM_PASSWORD);
-
- return new MailServer(host, port, username, password);
- }
-
/**
* Returns the list of program filters.
*
public List<ProgramFilter> getFilters() {
return _filters;
}
-
- /**
- * Returns the notifier to use.
- *
- * @return Notifier.
- */
- public Notifier getNotifier() {
- return _notifier;
- }
}
String reportXmlText = aReport.asXML();
return _transformer.textTransform(reportXmlText.getBytes(), _transformer.resolve(aXslt));
}
+
+ /* (non-Javadoc)
+ * @see org.wamblee.crawler.kiss.notification.Notifier#asHtml(org.dom4j.Element)
+ */
+ public String asHtml(Element aReport) throws IOException, TransformerException {
+ return transformReport(aReport, _htmlXslt);
+ }
+
+ /* (non-Javadoc)
+ * @see org.wamblee.crawler.kiss.notification.Notifier#asText(org.dom4j.Element)
+ */
+ public String asText(Element aReport) throws IOException, TransformerException {
+ return transformReport(aReport, _textXslt);
+ }
}
*/
package org.wamblee.crawler.kiss.notification;
+import java.io.IOException;
+
+import javax.xml.transform.TransformerException;
+
import org.dom4j.Element;
/**
* Report to send.
*/
void send(Element aReport) throws NotificationException;
+
+ /**
+ * Converts the report to html.
+ * @param aReport Report to convert.
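+ * @return The report converted to HTML.
+ * @throws IOException In case of I/O problems while converting the report.
+ * @throws TransformerException In case transforming the report fails.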
+ */
+ String asHtml(Element aReport) throws IOException, TransformerException;
+
+ /**
+ * Converts the report to text.
+ * @param aReport Report to convert.
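+ * @return The report converted to text.
+ * @throws IOException In case of I/O problems while converting the report.
+ * @throws TransformerException In case transforming the report fails.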
+ */
+ String asText(Element aReport) throws IOException, TransformerException;
}
<c:out value="${running}"/>
</td>
</tr>
- <tr>
+ <c:if test="${lastReport != null}">
+ <tr>
<td>
Last executed at:
</td>
<c:out value="${lastResult}"/>
</td>
</tr>
- <tr>
+ <tr>
<td>
Last message:
</td>
<td>
<c:out value="${lastMessage}" escapeXml="false"/>
</td>
- </tr>
+ </tr>
+ <tr>
+ <td>
+ Last report:
+ </td>
+ <td>
+ <a href="?details=1">details</a>
+ </td>
+ </tr>
+ </c:if>
</TABLE>
<c:if test="${!running}">
<FORM action="runnow">
class="org.springframework.context.support.ClassPathXmlApplicationContext">
<constructor-arg>
<list>
+ <value>org.wamblee.crawler.notification.xml</value>
<value>org.wamblee.crawler.kiss.xml</value>
</list>
</constructor-arg>
class="org.wamblee.crawler.kiss.scheduling.CrawlerExecutorImpl">
<constructor-arg><value>/home/erik/crawler/config.xml</value></constructor-arg>
<constructor-arg><value>/home/erik/crawler/programs.xml</value></constructor-arg>
+ <constructor-arg><ref bean="org.wamblee.crawler.kiss.notification.Notifier"/></constructor-arg>
</bean>
<!-- The object that determines whether to execute the crawler when it is signalled by
the scheduler. -->
- <bean id="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule"
- class="org.wamblee.crawler.kiss.scheduling.CrawlerSchedule">
+ <bean id="org.wamblee.crawler.kiss.scheduling.CrawlerStatus"
+ class="org.wamblee.crawler.kiss.scheduling.CrawlerStatus">
<constructor-arg><ref local="org.wamblee.crawler.kiss.scheduling.CrawlerExecutor"/></constructor-arg>
<!-- The interval of the day in hours [hourmin, hourmax] over which crawling will be done and
retried if necessary -->
import java.util.Date;
+import org.wamblee.crawler.kiss.main.Report;
+
/**
* Encapsulates the actual execution of the crawler.
* This interface makes it possible to test the scheduling logic
/**
* Executes the crawler.
- * @param aDate Date the crawler is being triggered.
+ * @param aDate Date the crawler is being triggered.
+ * @param aReport Report that is passed to the crawler for this execution.
* @throws Exception
*/
- void execute(Date aDate) throws Exception;
+ void execute(Date aDate, Report aReport) throws Exception;
}
import java.util.Date;
import org.wamblee.crawler.kiss.main.KissCrawler;
+import org.wamblee.crawler.kiss.main.Report;
+import org.wamblee.crawler.kiss.notification.Notifier;
/**
* Implementation which executes the KiSS crawler for retrieving web content.
public class CrawlerExecutorImpl implements CrawlerExecutor {
private String _crawlerConfig;
- private String _programConfig;
-
+ private String _programConfig;
+ private Notifier _notifier;
+
/**
* Constructs the crawler executor.
* @param aCrawlerConfig Crawler configuration file.
- * @param aProgramConfig Program configuration file.
+ * @param aProgramConfig Program configuration file.
+ * @param aNotifier Object used to send notifications.
*/
- public CrawlerExecutorImpl(String aCrawlerConfig, String aProgramConfig) {
+ public CrawlerExecutorImpl(String aCrawlerConfig, String aProgramConfig, Notifier aNotifier) {
_crawlerConfig = aCrawlerConfig;
- _programConfig = aProgramConfig;
+ _programConfig = aProgramConfig;
+ _notifier = aNotifier;
}
/* (non-Javadoc)
* @see org.wamblee.crawler.kiss.scheduling.CrawlerScheduler.CrawlerExecutor#execute(java.util.Date)
*/
- public void execute(Date aDate) throws Exception {
- KissCrawler crawler = new KissCrawler(_crawlerConfig, _programConfig);
+ public void execute(Date aDate, Report aReport) throws Exception {
+ KissCrawler crawler = new KissCrawler(_crawlerConfig, _programConfig, _notifier, aReport);
}
}
package org.wamblee.crawler.kiss.scheduling;
/**
- *
+ * Scheduler interface specific to the crawler.
*/
public interface CrawlerScheduler {
+ /**
+ * Initializes the scheduler.
+ * @throws Exception In case of problems.
+ */
void initialize() throws Exception;
+ /**
+ * Checks if the crawler is running.
+ * @return True iff the crawler is running.
+ * @throws Exception In case of problems.
+ */
boolean isCrawlerRunning() throws Exception;
-
+
+ /**
+ * Schedules the crawler for immediate execution.
+ * @throws Exception In case of problems.
+ */
void scheduleNow() throws Exception;
+ /**
+ * Shuts down the scheduler.
+ * @throws Exception In case of problems.
+ */
void shutdown() throws Exception;
-
}
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.wamblee.crawler.kiss.main.Report;
/**
* This class encapsulates the logic for deciding whether to
* more complex logic for determining whether to run the
* crawler.
*/
-public class CrawlerSchedule implements Serializable {
+public class CrawlerStatus implements Serializable {
- private static final Log LOG = LogFactory.getLog(CrawlerSchedule.class);
+ private static final Log LOG = LogFactory.getLog(CrawlerStatus.class);
private CrawlerExecutor _crawler;
private Date _lastExecuted;
private boolean _lastResult;
private Exception _lastException;
+ private Report _lastReport;
private int _hourMin;
private int _hourMax;
+ private boolean _mustExecute;
/**
* Constructs the scheduler.
* @param aHourMin The crawler may only run if hour >= <code>aHourMin</code>
* @param aHourMax The crawler may only run if hour <= <code>aHourMax</code>
*/
- public CrawlerSchedule(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
+ public CrawlerStatus(CrawlerExecutor aCrawler, int aHourMin, int aHourMax) {
_crawler = aCrawler;
_lastExecuted = new Date();
_lastResult = true; // the crawler will automatically run the next day.
- _lastException = null;
+ _lastException = null;
+ _lastReport = null;
_hourMin = aHourMin;
_hourMax = aHourMax;
+ _mustExecute = false;
+ }
+
+ /**
+ * Determines whether or not the crawler must be run the next time it is triggered.
+ * @param aMustExecute If true then the crawler will run the next time it is triggered
+ * by the scheduler.
+ */
+ public void setMustExecute(boolean aMustExecute) {
+ _mustExecute = aMustExecute;
}
/**
if (mustExecute(aDate)) {
LOG.info("Executing crawler at " + aDate);
- try {
- _crawler.execute(aDate);
+ Report report = new Report();
+ try {
+ _crawler.execute(aDate, report);
_lastResult = true;
_lastException = null;
} catch (Exception e) {
_lastException = e;
} finally {
_lastExecuted = aDate;
+ _lastReport = report;
}
}
}
return _lastExecuted;
}
- public void setLastExecuted(Date aDate) {
- _lastExecuted = aDate;
- }
-
/**
* Gets the result of the last execution.
* @return True iff last execution was a success.
return _lastException;
}
+ /**
+ * Gets the report of the last crawler execution.
+ * @return Report, or null if the crawler has not been executed yet.
+ */
+ public Report getLastReport() {
+ return _lastReport;
+ }
+
/**
* Determines whether or not the crawler must be run.
* @param aDate Current time.
* @return True iff the crawler must be run.
*/
private boolean mustExecute(Date aDate) {
+ if (_mustExecute) {
+ _mustExecute = false;
+ return true;
+ }
if ( _lastExecuted == null ) {
return false; // crawler must be started manually at least once after deployment.
}
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.quartz.StatefulJob;
-import org.wamblee.crawler.kiss.scheduling.CrawlerSchedule;
+import org.wamblee.crawler.kiss.scheduling.CrawlerStatus;
import org.wamblee.general.BeanKernel;
/**
throws JobExecutionException {
LOG.info("Job triggered");
try {
- CrawlerSchedule schedule = BeanKernel.getBeanFactory().find(
- CrawlerSchedule.class);
+ CrawlerStatus schedule = BeanKernel.getBeanFactory().find(
+ CrawlerStatus.class);
schedule.execute(aContext.getFireTime());
} catch (Exception e) {
throw new JobExecutionException("Error executing crawler", e, false);
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;
-import org.quartz.SchedulerException;
import org.wamblee.crawler.kiss.scheduling.CrawlerScheduler;
import org.wamblee.general.BeanKernel;
package org.wamblee.crawler.kiss.servlet;
import java.io.IOException;
-import java.util.Date;
+import java.io.OutputStream;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
-import org.wamblee.crawler.kiss.scheduling.CrawlerSchedule;
+import org.wamblee.crawler.kiss.main.Report;
+import org.wamblee.crawler.kiss.notification.Notifier;
import org.wamblee.crawler.kiss.scheduling.CrawlerScheduler;
+import org.wamblee.crawler.kiss.scheduling.CrawlerStatus;
import org.wamblee.general.BeanKernel;
/**
CrawlerScheduler scheduler = BeanKernel.getBeanFactory().find(
CrawlerScheduler.class);
- CrawlerSchedule status = BeanKernel.getBeanFactory().find(
- CrawlerSchedule.class);
+ CrawlerStatus status = BeanKernel.getBeanFactory().find(
+ CrawlerStatus.class);
try {
- if ( aRequest.getParameter("runnow") != null ) {
- status.setLastExecuted(new Date(System.currentTimeMillis() - 24*3600*1000));
+ if (aRequest.getParameter("details") != null) {
+ Report report = status.getLastReport();
+ if (report != null) {
+ Notifier notifier = BeanKernel.getBeanFactory().find(Notifier.class);
+ OutputStream os = aResponse.getOutputStream();
+ os.write(notifier.asHtml(report.asXml()).getBytes());
+ return;
+ }
+ }
+ if (aRequest.getParameter("runnow") != null) {
+ status.setMustExecute(true);
scheduler.scheduleNow();
aResponse.sendRedirect("");
return;
aRequest.setAttribute("lastExecuted", status.getLastExecuted());
aRequest.setAttribute("lastResult", status.getLastResult());
aRequest.setAttribute("lastException", status.getLastException());
- String msg = "";
- Throwable e = status.getLastException();
- while ( e != null ) {
- msg = msg + e.getMessage() + "<br/>";
+ aRequest.setAttribute("lastReport", status.getLastReport());
+ String msg = "";
+ Throwable e = status.getLastException();
+ while (e != null) {
+ msg = msg + e.getClass().getName() + ": " + e.getMessage()
+ + "<br/>";
e = e.getCause();
}
aRequest.setAttribute("lastMessage", msg);