From: erik Date: Wed, 23 Aug 2006 22:13:15 +0000 (+0000) Subject: Now crawling in desktop mode should work. It is much more efficient X-Git-Tag: BEFORE_MAVEN_MIGRATION~9 X-Git-Url: http://wamblee.org/gitweb/?a=commitdiff_plain;ds=sidebyside;h=5693cee5e48f72bf1f054feadb2bc7a6045f99e5;hp=e359f7995fcafb2ce18576be6f21345f9f7b1482;p=utils Now crawling in desktop mode should work. It is much more efficient since it does not need to click on each and every program to obtain program information. Now, the crawler examines the next day instead of the current day and it is scheduled to run between 19:00 and 24:00. --- diff --git a/.classpath b/.classpath index af50c67a..918f2c28 100644 --- a/.classpath +++ b/.classpath @@ -50,7 +50,7 @@ - + diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java index da3fd2be..e88f4007 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java @@ -61,7 +61,7 @@ public class ConfigurationParser { private static final int MAX_TRIES = 3; - private static final int MAX_DELAY = 5000; + private static final int MAX_DELAY = 10000; private XslTransformer _transformer; diff --git a/crawler/kiss/src/channel-right-now-graphic.xsl b/crawler/kiss/src/channel-right-now-graphic.xsl index 43356172..6b5070c4 100644 --- a/crawler/kiss/src/channel-right-now-graphic.xsl +++ b/crawler/kiss/src/channel-right-now-graphic.xsl @@ -1,13 +1,12 @@ - + xmlns:xhtml="http://www.w3.org/1999/xhtml" version="1.0"> + - + - + @@ -16,55 +15,55 @@ - + - + - + - - - - - - - - - - program-info - - - - - - - - - - - - - + + + + + + + + + + program-info + + + + + + + + + + + + + - + - + - - > - - - - - - + + + + + + - + diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java index 3191ff24..3300e129 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java @@ -32,6 +32,7 @@ import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.dom4j.Element; import org.wamblee.crawler.Action; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; @@ -63,11 +64,11 @@ public class KissCrawler { * Start URL of the electronic programme guide. */ private static final String START_URL = "http://epg.kml.kiss-technology.com/login.php"; - + /** - * Default socket timeout to use. + * Default socket timeout to use. */ - private static final int SOCKET_TIMEOUT = 10000; + private static final int SOCKET_TIMEOUT = 10000; /** * Regular expression for matching time interval strings in the retrieved @@ -89,14 +90,15 @@ public class KissCrawler { * In case of problems. */ public static void main(String[] aArgs) throws Exception { - String crawlerConfig = new File(aArgs[0]).getCanonicalPath(); - String programConfig = new File(aArgs[1]).getCanonicalPath(); + String crawlerConfig = new File(aArgs[0]).getCanonicalPath(); + String programConfig = new File(aArgs[1]).getCanonicalPath(); BeanFactory factory = new StandaloneCrawlerBeanFactory(); Notifier notifier = factory.find(Notifier.class); - new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig, notifier, new Report()); + new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, + programConfig, notifier, new Report()); } - + /** * Constructs the crawler. This retrieves the TV guide by crawling the KiSS * EPG guide, filters the guide for interesting programs, tries to record @@ -106,19 +108,24 @@ public class KissCrawler { * Configuration file for the crawler. * @param aProgramConfig * Configuration file describing interesting shows. - * @param aNotifier Object used to send notifications of the results. - * @param aReport Report to use. + * @param aNotifier + * Object used to send notifications of the results. + * @param aReport + * Report to use. * @throws IOException * In case of problems reading files. - * @throws NotificationException In case notification fails. - * @throws PageException In case of problems retrieving the TV guide. + * @throws NotificationException + * In case notification fails. + * @throws PageException + * In case of problems retrieving the TV guide. */ - public KissCrawler(String aCrawlerConfig, - String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException { - this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig, aNotifier, aReport); + public KissCrawler(String aCrawlerConfig, String aProgramConfig, + Notifier aNotifier, Report aReport) throws IOException, + NotificationException, PageException { + this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig, + aNotifier, aReport); } - /** * Constructs the crawler. This retrieves the TV guide by crawling the KiSS * EPG guide, filters the guide for interesting programs, tries to record @@ -126,27 +133,35 @@ public class KissCrawler { * * @param aStartUrl * Start URL of the electronic programme guide. - * @param aSocketTimeout Socket timeout to use. + * @param aSocketTimeout + * Socket timeout to use. * @param aCrawlerConfig * Configuration file for the crawler. * @param aProgramConfig * Configuration file describing interesting shows. - * @param aNotifier Object used to send notifications of the results. - * @param aReport Report to use. + * @param aNotifier + * Object used to send notifications of the results. + * @param aReport + * Report to use. * @throws IOException * In case of problems reading files. - * @throws NotificationException In case notification fails. - * @throws PageException In case of problems retrieving the TV guide. + * @throws NotificationException + * In case notification fails. + * @throws PageException + * In case of problems retrieving the TV guide. */ - public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig, - String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException { + public KissCrawler(String aStartUrl, int aSocketTimeout, + String aCrawlerConfig, String aProgramConfig, Notifier aNotifier, + Report aReport) throws IOException, NotificationException, + PageException { _pattern = Pattern.compile(TIME_REGEX); try { HttpClient client = new HttpClient(); // client.getHostConfiguration().setProxy("127.0.0.1", 3128); - client.getParams().setParameter("http.socket.timeout", SOCKET_TIMEOUT); + client.getParams().setParameter("http.socket.timeout", + SOCKET_TIMEOUT); XslTransformer transformer = new XslTransformer( new ClasspathUriResolver()); @@ -163,12 +178,11 @@ public class KissCrawler { TVGuide guide = createGuide(page, aReport); PrintVisitor printer = new PrintVisitor(System.out); guide.accept(printer); - processResults(programFilters, guide, aNotifier, - aReport); + processResults(programFilters, guide, aNotifier, aReport); } catch (PageException e) { aReport.addMessage("Problem getting TV guide", e); LOG.info("Problem getting TV guide", e); - throw e; + throw e; } aNotifier.send(aReport.asXml()); } finally { @@ -260,12 +274,14 @@ public class KissCrawler { * @param aReport * Report to use. * @return TV guide. - * @throws PageException In case of problem getting the tv guide. + * @throws PageException + * In case of problem getting the tv guide. */ - private TVGuide createGuide(Page aPage, Report aReport) throws PageException { + private TVGuide createGuide(Page aPage, Report aReport) + throws PageException { LOG.info("Obtaining full TV guide"); Action[] actions = aPage.getActions(); - if ( actions.length == 0 ) { + if (actions.length == 0) { LOG.error("No channels found"); throw new PageException("No channels found"); } @@ -273,13 +289,13 @@ public class KissCrawler { for (Action action : actions) { try { LOG.info("Getting channel info for '" + action.getName() + "'"); - Action rightNow = action.execute().getAction("right-now"); - if (rightNow == null) { + Action tomorrow = action.execute().getAction("tomorrow"); + if (tomorrow == null) { throw new PageException("Channel summary page for '" + action.getName() + "' does not contain required information"); } - Channel channel = createChannel(action.getName(), rightNow + Channel channel = createChannel(action.getName(), tomorrow .execute(), aReport); channels.add(channel); if (SystemProperties.isDebugMode()) { @@ -319,18 +335,25 @@ public class KissCrawler { TimeInterval interval = new TimeInterval(begin, end); String description = ""; String keywords = ""; + if (!SystemProperties.isNoProgramDetailsRequired()) { - try { - Page programInfo = action.execute(); - description = programInfo.getContent().element( - "description").getText().trim(); - keywords = programInfo.getContent().element("keywords") - .getText().trim(); - } catch (PageException e) { - String msg = "Program details could not be determined for '" - + action.getName() + "'"; - aReport.addMessage(msg, e); - LOG.warn(msg, e); + Element descriptionElem = action.getContent().element( + "description"); + if (descriptionElem == null) { + try { + Page programInfo = action.execute(); + description = programInfo.getContent().element( + "description").getText().trim(); + keywords = programInfo.getContent().element( + "keywords").getText().trim(); + } catch (PageException e) { + String msg = "Program details could not be determined for '" + + action.getName() + "'"; + aReport.addMessage(msg, e); + LOG.warn(msg, e); + } + } else { + description = descriptionElem.getTextTrim(); } } Program program = new Program(aChannel, action.getName(), diff --git a/crawler/kissweb/src/org.wamblee.crawler.kiss.xml b/crawler/kissweb/src/org.wamblee.crawler.kiss.xml index 6f0e2c96..022b4aa0 100644 --- a/crawler/kissweb/src/org.wamblee.crawler.kiss.xml +++ b/crawler/kissweb/src/org.wamblee.crawler.kiss.xml @@ -24,7 +24,7 @@ - 5 + 19 24