From 2160337764463b1cd9217671f0f9c2f6ab89dcb0 Mon Sep 17 00:00:00 2001 From: Erik Brakkee Date: Thu, 30 Mar 2006 11:07:31 +0000 Subject: [PATCH] more robustness, now a detailed report is always sent, also if crawling fails. --- .../wamblee/crawler/AbstractPageRequest.java | 2 +- crawler/kiss/conf/kiss/programs.xml | 4 + crawler/kiss/conf/xml/report.xml | 6 + .../crawler/kiss/guide/AbstractVisitor.java | 34 ++-- .../org/wamblee/crawler/kiss/guide/Time.java | 15 +- .../crawler/kiss/main/KissCrawler.java | 84 +++++--- .../kiss/main/ProgramActionExecutor.java | 116 +++-------- .../org/wamblee/crawler/kiss/main/Report.java | 182 ++++++++++++++++++ crawler/kiss/src/reportToHtml.xsl | 13 ++ crawler/kiss/src/reportToText.xsl | 13 ++ 10 files changed, 326 insertions(+), 143 deletions(-) create mode 100644 crawler/kiss/src/org/wamblee/crawler/kiss/main/Report.java diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index c0284d69..432ebb4a 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -252,7 +252,7 @@ public abstract class AbstractPageRequest implements PageRequest { // recursion. } default: { - throw new RuntimeException("Method failed: " + throw new IOException("Method failed: " + aMethod.getStatusLine()); } } diff --git a/crawler/kiss/conf/kiss/programs.xml b/crawler/kiss/conf/kiss/programs.xml index b98ecf63..aa0664c2 100644 --- a/crawler/kiss/conf/kiss/programs.xml +++ b/crawler/kiss/conf/kiss/programs.xml @@ -100,6 +100,10 @@ bedreigde.*paradijzen + + + wie is de baas + wetenschap diff --git a/crawler/kiss/conf/xml/report.xml b/crawler/kiss/conf/xml/report.xml index 391aea94..c6da6fb7 100644 --- a/crawler/kiss/conf/xml/report.xml +++ b/crawler/kiss/conf/xml/report.xml @@ -1,4 +1,10 @@ + + + Hello world! 
+ and another message + + Wintertijd diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/guide/AbstractVisitor.java b/crawler/kiss/src/org/wamblee/crawler/kiss/guide/AbstractVisitor.java index f4b07bff..a6c0d69a 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/guide/AbstractVisitor.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/guide/AbstractVisitor.java @@ -12,41 +12,43 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.kiss.guide; - - /** - * Abstract visitor of the tv guide with default looping behavior. + * Abstract visitor of the tv guide with default looping behavior. */ public abstract class AbstractVisitor implements Visitor { - + /** - * Constructs the visitor. - * + * Constructs the visitor. + * */ - protected AbstractVisitor() { + protected AbstractVisitor() { // Empty } /** - * Visits the channel by visiting all programs of the channel. - * @param aChannel Channel to visit. + * Visits the channel by visiting all programs of the channel. + * + * @param aChannel + * Channel to visit. */ public void visitChannel(Channel aChannel) { - for (Program program: aChannel.getPrograms() ) { - program.accept(this); + for (Program program : aChannel.getPrograms()) { + program.accept(this); } } /** - * Visits the TV guide by visiting all channels of the guide. - * @param aGuide TV guide to visit. + * Visits the TV guide by visiting all channels of the guide. + * + * @param aGuide + * TV guide to visit. 
*/ - public void visitTvGuide(TVGuide aGuide) { - for (Channel channel: aGuide.getChannels()) { + public void visitTvGuide(TVGuide aGuide) { + for (Channel channel : aGuide.getChannels()) { channel.accept(this); } } diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/guide/Time.java b/crawler/kiss/src/org/wamblee/crawler/kiss/guide/Time.java index 2d05bac1..de206f55 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/guide/Time.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/guide/Time.java @@ -98,10 +98,11 @@ public class Time implements Comparable { */ float asFloat() { int hour = _hour; - // Hack to make sure that programs appearing shortly after midnight are sorted - // after those running during the day. - if ( hour <= EARLY_HOUR ) { - hour += HOURS_PER_DAY; + // Hack to make sure that programs appearing shortly after midnight are + // sorted + // after those running during the day. + if (hour <= EARLY_HOUR) { + hour += HOURS_PER_DAY; } return (float) hour + (float) _minute / (float) SECONDS_PER_MINUTE; } @@ -120,8 +121,10 @@ public class Time implements Comparable { } /** - * Compares based on time. - * @param aObject Time object to compare to. + * Compares based on time. + * + * @param aObject + * Time object to compare to. * @return See {@link Comparable#compareTo(T)}. */ public int compareTo(Object aObject) { diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java index ad8de402..0c01a9bc 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java @@ -112,28 +112,39 @@ public class KissCrawler { * In case of problems sending a mail notification. 
*/ public KissCrawler(String aStartUrl, String aCrawlerConfig, - String aProgramConfig) throws IOException, MessagingException { + String aProgramConfig) throws IOException, NotificationException { _pattern = Pattern.compile(TIME_REGEX); try { HttpClient client = new HttpClient(); // client.getHostConfiguration().setProxy("127.0.0.1", 3128); - - XslTransformer transformer = new XslTransformer(new ClasspathUriResolver()); + + XslTransformer transformer = new XslTransformer( + new ClasspathUriResolver()); Crawler crawler = createCrawler(aCrawlerConfig, client, transformer); InputStream programConfigFile = new FileInputStream(new File( aProgramConfig)); - ProgramConfigurationParser parser = new ProgramConfigurationParser(transformer); + ProgramConfigurationParser parser = new ProgramConfigurationParser( + transformer); parser.parse(programConfigFile); List programFilters = parser.getFilters(); - Page page = getStartPage(aStartUrl, crawler); - TVGuide guide = createGuide(page); - PrintVisitor printer = new PrintVisitor(System.out); - guide.accept(printer); - processResults(programFilters, guide, parser.getNotifier()); + Report report = new Report(); + + try { + Page page = getStartPage(aStartUrl, crawler, report); + TVGuide guide = createGuide(page, report); + PrintVisitor printer = new PrintVisitor(System.out); + guide.accept(printer); + processResults(programFilters, guide, parser.getNotifier(), + report); + } catch (PageException e) { + report.addMessage("Problem getting TV guide", e); + LOG.info("Problem getting TV guide", e); + } + parser.getNotifier().send(report.asXml()); } finally { System.out.println("Crawler finished"); } @@ -150,8 +161,8 @@ public class KissCrawler { * In case of problems sending a summary mail. 
*/ private void processResults(List aProgramCondition, - TVGuide aGuide, Notifier aNotifier) throws MessagingException { - ProgramActionExecutor executor = new ProgramActionExecutor(); + TVGuide aGuide, Notifier aNotifier, Report aReport) { + ProgramActionExecutor executor = new ProgramActionExecutor(aReport); for (ProgramFilter filter : aProgramCondition) { List programs = filter.apply(aGuide); ProgramAction action = filter.getAction(); @@ -160,11 +171,7 @@ public class KissCrawler { } } executor.commit(); - try { - aNotifier.send(executor.getReport()); - } catch (NotificationException e) { - throw new RuntimeException(e); - } + } /** @@ -180,8 +187,8 @@ public class KissCrawler { * @throws FileNotFoundException * In case configuration files cannot be found. */ - private Crawler createCrawler(String aCrawlerConfig, - HttpClient aClient, XslTransformer aTransformer) throws FileNotFoundException { + private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient, + XslTransformer aTransformer) throws FileNotFoundException { ConfigurationParser parser = new ConfigurationParser(aTransformer); InputStream crawlerConfigFile = new FileInputStream(new File( aCrawlerConfig)); @@ -198,15 +205,23 @@ public class KissCrawler { * URL of the electronic programme guide. * @param aCrawler * Crawler to use. + * @param aReport + * Report to use. * @return Starting page. 
*/ - private Page getStartPage(String aStartUrl, Crawler aCrawler) { + private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport) + throws PageException { try { Page page = aCrawler.getPage(aStartUrl); - return page.getAction("channels-favorites").execute(); + Action favorites = page.getAction("channels-favorites"); + if (favorites == null) { + String msg = "Channels favorites action not found on start page"; + throw new PageException(msg); + } + return favorites.execute(); } catch (PageException e) { - throw new RuntimeException( - "Could not login to electronic program guide", e); + String msg = "Could not login to electronic programme guide."; + throw new PageException(msg, e); } } @@ -215,22 +230,32 @@ public class KissCrawler { * * @param aPage * Starting page. + * @param aReport + * Report to use. * @return TV guide. */ - private TVGuide createGuide(Page aPage) { + private TVGuide createGuide(Page aPage, Report aReport) { LOG.info("Obtaining full TV guide"); Action[] actions = aPage.getActions(); List channels = new ArrayList(); for (Action action : actions) { try { LOG.info("Getting channel info for '" + action.getName() + "'"); - Channel channel = createChannel(action.getName(), action - .execute().getAction("right-now").execute()); + Action rightNow = action.execute().getAction("right-now"); + if (rightNow == null) { + throw new PageException("Channel summary page for '" + + action.getName() + + "' does not contain required information"); + } + Channel channel = createChannel(action.getName(), rightNow + .execute(), aReport); channels.add(channel); if (SystemProperties.isDebugMode()) { break; // Only one channel is crawled. } } catch (PageException e) { + aReport.addMessage("Could not create channel information for '" + + action.getName() + "'"); LOG.error("Could not create channel information for '" + action.getName() + "'", e); } @@ -247,7 +272,7 @@ public class KissCrawler { * Starting page for the channel. * @return Channel. 
*/ - private Channel createChannel(String aChannel, Page aPage) { + private Channel createChannel(String aChannel, Page aPage, Report aReport) { LOG.info("Obtaining program for " + aChannel); Action[] programActions = aPage.getActions(); List programs = new ArrayList(); @@ -270,9 +295,10 @@ public class KissCrawler { keywords = programInfo.getContent().element("keywords") .getText().trim(); } catch (PageException e) { - LOG.warn( - "Program details could not be determined for '" - + action.getName() + "'", e); + String msg = "Program details could not be determined for '" + + action.getName() + "'"; + aReport.addMessage(msg, e); + LOG.warn(msg, e); } } Program program = new Program(aChannel, action.getName(), diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/main/ProgramActionExecutor.java b/crawler/kiss/src/org/wamblee/crawler/kiss/main/ProgramActionExecutor.java index 51aef60f..c19b6473 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/main/ProgramActionExecutor.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/main/ProgramActionExecutor.java @@ -16,7 +16,6 @@ package org.wamblee.crawler.kiss.main; -import java.util.EnumMap; import java.util.HashSet; import java.util.Map; import java.util.Set; @@ -25,8 +24,6 @@ import java.util.TreeSet; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.dom4j.DocumentFactory; -import org.dom4j.Element; import org.wamblee.crawler.kiss.guide.Program; import org.wamblee.crawler.kiss.guide.TimeInterval; import org.wamblee.crawler.kiss.guide.Program.RecordingResult; @@ -41,35 +38,23 @@ public class ProgramActionExecutor { private static final Log LOG = LogFactory .getLog(ProgramActionExecutor.class); - /** - * A map of category name to a set of program. Useful for displaying the - * output of possibly interesting programs on a per category basis. - */ - private Map> _interestingShows; - /** * Map of priority to set of programs. 
*/ private Map> _showsToRecord; - + /** - * Map or recording result to a set of programs. + * Report to use. */ - private EnumMap> _recordings; + private Report _report; /** * Constructs the program action executor. * */ - public ProgramActionExecutor() { - _interestingShows = new TreeMap>(); + public ProgramActionExecutor(Report aReport) { _showsToRecord = new TreeMap>(); - _recordings = new EnumMap>( - RecordingResult.class); - for (RecordingResult result : RecordingResult.values()) { - _recordings.put(result, new TreeSet( - new Program.TimeSorter())); - } + _report = aReport; } /** @@ -82,12 +67,12 @@ public class ProgramActionExecutor { */ public void recordProgram(int aPriority, Program aProgram) { LOG.info("priority = " + aPriority + ", program: " + aProgram); - // Putting -priority into the set makes sure that iteration order - // over the priorities will go from higher priority to lower priority. + // Putting -priority into the set makes sure that iteration order + // over the priorities will go from higher priority to lower priority. 
Set programs = _showsToRecord.get(-aPriority); if (programs == null) { programs = new TreeSet(new Program.TimeSorter()); - _showsToRecord.put(-aPriority, programs); + _showsToRecord.put(-aPriority, programs); } programs.add(aProgram); } @@ -102,12 +87,7 @@ public class ProgramActionExecutor { */ public void interestingProgram(String aCategory, Program aProgram) { LOG.info("category = '" + aCategory + "', program: " + aProgram); - Set programs = _interestingShows.get(aCategory); - if (programs == null) { - programs = new TreeSet(new Program.TimeSorter()); - _interestingShows.put(aCategory, programs); - } - programs.add(aProgram); + _report.interestingProgram(aCategory, aProgram); } /** @@ -117,80 +97,34 @@ public class ProgramActionExecutor { Set previouslyRecorded = new HashSet(); for (Integer priority : _showsToRecord.keySet()) { for (Program program : _showsToRecord.get(priority)) { - TimeInterval interval = program.getInterval(); - if ( recordingConflictExists(previouslyRecorded, interval)) { - _recordings.get(RecordingResult.CONFLICT).add(program); + TimeInterval interval = program.getInterval(); + if (recordingConflictExists(previouslyRecorded, interval)) { + _report.setRecordingResult(RecordingResult.CONFLICT, program); } else { RecordingResult result = program.record(); - _recordings.get(result).add(program); + _report.setRecordingResult(result, program); previouslyRecorded.add(interval); } } } } - - /** - * Checks an interval for overlap with a previously recorded program. - * @param aPreviouslyRecorded Previously recorded programs. - * @param interval Interval. - * @return True iff there is a recording conflict. - */ - private boolean recordingConflictExists(Set aPreviouslyRecorded, TimeInterval interval) { - for (TimeInterval recordedInterval: aPreviouslyRecorded ) { - if ( interval.overlap(recordedInterval)) { - return true; - } - } - return false; - } /** - * Get report as XML. + * Checks an interval for overlap with a previously recorded program. 
* - * @return XML report + * @param aPreviouslyRecorded + * Previously recorded programs. + * @param aInterval + * Interval. + * @return True iff there is a recording conflict. */ - public Element getReport() { - DocumentFactory factory = DocumentFactory.getInstance(); - Element report = factory.createElement("report"); - - Set reportedPrograms = new HashSet(); - - for (RecordingResult result : RecordingResult.values()) { - if (_recordings.get(result).size() > 0) { - Element recordingResult = report.addElement("recorded") - .addAttribute("result", result.toString()); - - for (Program program : _recordings.get(result)) { - recordingResult.add(program.asXml()); - reportedPrograms.add(program); - } - } - } - - if (_interestingShows.size() > 0) { - Element interesting = report.addElement("interesting"); - for (String category : _interestingShows.keySet()) { - Element categoryElem = interesting; - if (category.length() > 0) { - categoryElem = interesting.addElement("category"); - categoryElem.addAttribute("name", category); - } - for (Program program : _interestingShows.get(category)) { - if ( !reportedPrograms.contains(program)) { - categoryElem.add(program.asXml()); - } else { - LOG.info("Category '" + category + "', program " + program + " already reported"); - } - } - if ( categoryElem.elements().size() == 0 ) { - // Remove empty category element. 
- LOG.info("Removing element for category '" + category + "'"); - interesting.remove(categoryElem); - } + private boolean recordingConflictExists( + Set aPreviouslyRecorded, TimeInterval aInterval) { + for (TimeInterval recordedInterval : aPreviouslyRecorded) { + if (aInterval.overlap(recordedInterval)) { + return true; } - } - - return report; + return false; } } diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/main/Report.java b/crawler/kiss/src/org/wamblee/crawler/kiss/main/Report.java new file mode 100644 index 00000000..1b0406c4 --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/main/Report.java @@ -0,0 +1,182 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss.main; + +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.DocumentFactory; +import org.dom4j.Element; +import org.wamblee.crawler.kiss.guide.Program; +import org.wamblee.crawler.kiss.guide.Program.RecordingResult; + +/** + * Represents a report on the actions of the crawler. + */ +public class Report { + + private static final Log LOG = LogFactory + .getLog(Report.class); + + /** + * A map of category name to a set of program. 
Useful for displaying the + * output of possibly interesting programs on a per category basis. + */ + private Map> _interestingShows; + + /** + * Map of recording result to a set of programs. + */ + private EnumMap> _recordings; + + /** + * Messages generated while doing all the work. + */ + private List _messages; + + /** + * Constructs an empty report. + * + */ + public Report() { + _interestingShows = new TreeMap>(); + _recordings = new EnumMap>( + RecordingResult.class); + for (RecordingResult result : RecordingResult.values()) { + _recordings.put(result, new TreeSet( + new Program.TimeSorter())); + } + _messages = new ArrayList(); + } + + /** + * Adds a message. + * + * @param aMessage + * Message to add. + */ + public void addMessage(String aMessage) { + _messages.add(aMessage); + } + + /** + * Adds a message. + * + * @param aMessage + * Message to add. + * @param aException Exception that caused the problem. + */ + public void addMessage(String aMessage, Exception aException) { + _messages.add(aMessage + ": " + aException.getMessage()); + } + + /** + * Called to indicate that a program is interesting. + * + * @param aCategory + * Category of the program. + * @param aProgram + * Program. + */ + public void interestingProgram(String aCategory, Program aProgram) { + LOG.info("category = '" + aCategory + "', program: " + aProgram); + Set programs = _interestingShows.get(aCategory); + if (programs == null) { + programs = new TreeSet(new Program.TimeSorter()); + _interestingShows.put(aCategory, programs); + } + programs.add(aProgram); + } + + /** + * Called to specify the result of recording a program. + * @param aResult Result. + * @param aProgram Program. + */ + public void setRecordingResult(RecordingResult aResult, Program aProgram) { + _recordings.get(aResult).add(aProgram); + } + + + /** + * Get report as XML. 
+ * + * @return XML report + */ + public Element asXml() { + DocumentFactory factory = DocumentFactory.getInstance(); + Element report = factory.createElement("report"); + + if (_messages.size() > 0) { + Element messages = report.addElement("messages"); + for (String message : _messages) { + messages.addElement("message").setText(message); + } + } + + Set reportedPrograms = new HashSet(); + + for (RecordingResult result : RecordingResult.values()) { + if (_recordings.get(result).size() > 0) { + Element recordingResult = report.addElement("recorded") + .addAttribute("result", result.toString()); + + for (Program program : _recordings.get(result)) { + recordingResult.add(program.asXml()); + reportedPrograms.add(program); + } + } + } + + if (_interestingShows.size() > 0) { + Element interesting = report.addElement("interesting"); + for (String category : _interestingShows.keySet()) { + Element categoryElem = interesting; + if (category.length() > 0) { + categoryElem = interesting.addElement("category"); + categoryElem.addAttribute("name", category); + } + for (Program program : _interestingShows.get(category)) { + if (!reportedPrograms.contains(program)) { + categoryElem.add(program.asXml()); + } else { + LOG.info("Category '" + category + "', program " + + program + " already reported"); + } + } + if (categoryElem.elements().size() == 0) { + // Remove empty category element. + LOG + .info("Removing element for category '" + category + + "'"); + interesting.remove(categoryElem); + } + } + + } + + return report; + } +} diff --git a/crawler/kiss/src/reportToHtml.xsl b/crawler/kiss/src/reportToHtml.xsl index 44765a32..23bc1730 100644 --- a/crawler/kiss/src/reportToHtml.xsl +++ b/crawler/kiss/src/reportToHtml.xsl @@ -16,6 +16,7 @@ No suitable programs found + @@ -94,5 +95,17 @@

Category:

+ + +

Messages

+
    + +
  • + + +
  • +
    +
+
diff --git a/crawler/kiss/src/reportToText.xsl b/crawler/kiss/src/reportToText.xsl index f619dd1f..741a678d 100644 --- a/crawler/kiss/src/reportToText.xsl +++ b/crawler/kiss/src/reportToText.xsl @@ -16,6 +16,8 @@ + + @@ -105,4 +107,15 @@ + + Messages + + + + * + + + + + -- 2.31.1