X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fkiss%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2Fmain%2FKissCrawler.java;h=3300e1299e71b6c4268d9ba9447b65d6df17fe8f;hb=5693cee5e48f72bf1f054feadb2bc7a6045f99e5;hp=976d7d15de9382361f0cfe3118c1838d5cc9cfb9;hpb=69a4737ea1d5fc3c637c7abc301c5863529c7129;p=utils diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java index 976d7d15..3300e129 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java @@ -29,8 +29,10 @@ import java.util.regex.Pattern; import javax.mail.MessagingException; import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.dom4j.Element; import org.wamblee.crawler.Action; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; @@ -46,6 +48,7 @@ import org.wamblee.crawler.kiss.guide.Time; import org.wamblee.crawler.kiss.guide.TimeInterval; import org.wamblee.crawler.kiss.notification.NotificationException; import org.wamblee.crawler.kiss.notification.Notifier; +import org.wamblee.general.BeanFactory; import org.wamblee.xml.ClasspathUriResolver; import org.wamblee.xml.XslTransformer; @@ -60,13 +63,18 @@ public class KissCrawler { /** * Start URL of the electronic programme guide. */ - private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; + private static final String START_URL = "http://epg.kml.kiss-technology.com/login.php"; + + /** + * Default socket timeout to use. + */ + private static final int SOCKET_TIMEOUT = 10000; /** * Regular expression for matching time interval strings in the retrieved * pages. 
*/ - private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; + private static final String TIME_REGEX = "[^0-9]*([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; /** * Compiled pattern for the time regular expression. @@ -82,11 +90,15 @@ public class KissCrawler { * In case of problems. */ public static void main(String[] aArgs) throws Exception { - String crawlerConfig = new File(aArgs[0]).getCanonicalPath(); - String programConfig = new File(aArgs[1]).getCanonicalPath(); - new KissCrawler(START_URL, crawlerConfig, programConfig); + String crawlerConfig = new File(aArgs[0]).getCanonicalPath(); + String programConfig = new File(aArgs[1]).getCanonicalPath(); + + BeanFactory factory = new StandaloneCrawlerBeanFactory(); + Notifier notifier = factory.find(Notifier.class); + new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, + programConfig, notifier, new Report()); } - + /** * Constructs the crawler. This retrieves the TV guide by crawling the KiSS * EPG guide, filters the guide for interesting programs, tries to record @@ -96,17 +108,24 @@ public class KissCrawler { * Configuration file for the crawler. * @param aProgramConfig * Configuration file describing interesting shows. + * @param aNotifier + * Object used to send notifications of the results. + * @param aReport + * Report to use. * @throws IOException * In case of problems reading files. - * @throws MessagingException - * In case of problems sending a mail notification. + * @throws NotificationException + * In case notification fails. + * @throws PageException + * In case of problems retrieving the TV guide. 
*/ - public KissCrawler(String aCrawlerConfig, - String aProgramConfig) throws IOException, NotificationException { - this(START_URL, aCrawlerConfig, aProgramConfig); + public KissCrawler(String aCrawlerConfig, String aProgramConfig, + Notifier aNotifier, Report aReport) throws IOException, + NotificationException, PageException { + this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig, + aNotifier, aReport); } - /** * Constructs the crawler. This retrieves the TV guide by crawling the KiSS * EPG guide, filters the guide for interesting programs, tries to record @@ -114,23 +133,35 @@ public class KissCrawler { * * @param aStartUrl * Start URL of the electronic programme guide. + * @param aSocketTimeout + * Socket timeout to use. * @param aCrawlerConfig * Configuration file for the crawler. * @param aProgramConfig * Configuration file describing interesting shows. + * @param aNotifier + * Object used to send notifications of the results. + * @param aReport + * Report to use. * @throws IOException * In case of problems reading files. - * @throws MessagingException - * In case of problems sending a mail notification. + * @throws NotificationException + * In case notification fails. + * @throws PageException + * In case of problems retrieving the TV guide. 
*/ - public KissCrawler(String aStartUrl, String aCrawlerConfig, - String aProgramConfig) throws IOException, NotificationException { + public KissCrawler(String aStartUrl, int aSocketTimeout, + String aCrawlerConfig, String aProgramConfig, Notifier aNotifier, + Report aReport) throws IOException, NotificationException, + PageException { _pattern = Pattern.compile(TIME_REGEX); try { HttpClient client = new HttpClient(); // client.getHostConfiguration().setProxy("127.0.0.1", 3128); + client.getParams().setParameter("http.socket.timeout", + aSocketTimeout); XslTransformer transformer = new XslTransformer( new ClasspathUriResolver()); @@ -138,25 +169,22 @@ public class KissCrawler { Crawler crawler = createCrawler(aCrawlerConfig, client, transformer); InputStream programConfigFile = new FileInputStream(new File( aProgramConfig)); - ProgramConfigurationParser parser = new ProgramConfigurationParser( - transformer); + ProgramConfigurationParser parser = new ProgramConfigurationParser(); parser.parse(programConfigFile); List programFilters = parser.getFilters(); - Report report = new Report(); - try { - Page page = getStartPage(aStartUrl, crawler, report); - TVGuide guide = createGuide(page, report); + Page page = getStartPage(aStartUrl, crawler, aReport); + TVGuide guide = createGuide(page, aReport); PrintVisitor printer = new PrintVisitor(System.out); guide.accept(printer); - processResults(programFilters, guide, parser.getNotifier(), - report); + processResults(programFilters, guide, aNotifier, aReport); } catch (PageException e) { - report.addMessage("Problem getting TV guide", e); + aReport.addMessage("Problem getting TV guide", e); LOG.info("Problem getting TV guide", e); + throw e; } - parser.getNotifier().send(report.asXml()); + aNotifier.send(aReport.asXml()); } finally { System.out.println("Crawler finished"); } @@ -224,7 +252,8 @@ public class KissCrawler { private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport) throws PageException { try 
{ - Page page = aCrawler.getPage(aStartUrl); + Page page = aCrawler.getPage(aStartUrl, new NameValuePair[0]); + page = page.getAction("login").execute(); Action favorites = page.getAction("channels-favorites"); if (favorites == null) { String msg = "Channels favorites action not found on start page"; @@ -245,21 +274,28 @@ public class KissCrawler { * @param aReport * Report to use. * @return TV guide. + * @throws PageException + * In case of problem getting the tv guide. */ - private TVGuide createGuide(Page aPage, Report aReport) { + private TVGuide createGuide(Page aPage, Report aReport) + throws PageException { LOG.info("Obtaining full TV guide"); Action[] actions = aPage.getActions(); + if (actions.length == 0) { + LOG.error("No channels found"); + throw new PageException("No channels found"); + } List channels = new ArrayList(); for (Action action : actions) { try { LOG.info("Getting channel info for '" + action.getName() + "'"); - Action rightNow = action.execute().getAction("right-now"); - if (rightNow == null) { + Action tomorrow = action.execute().getAction("tomorrow"); + if (tomorrow == null) { throw new PageException("Channel summary page for '" + action.getName() + "' does not contain required information"); } - Channel channel = createChannel(action.getName(), rightNow + Channel channel = createChannel(action.getName(), tomorrow .execute(), aReport); channels.add(channel); if (SystemProperties.isDebugMode()) { @@ -299,18 +335,25 @@ public class KissCrawler { TimeInterval interval = new TimeInterval(begin, end); String description = ""; String keywords = ""; + if (!SystemProperties.isNoProgramDetailsRequired()) { - try { - Page programInfo = action.execute(); - description = programInfo.getContent().element( - "description").getText().trim(); - keywords = programInfo.getContent().element("keywords") - .getText().trim(); - } catch (PageException e) { - String msg = "Program details could not be determined for '" - + action.getName() + "'"; - 
aReport.addMessage(msg, e); - LOG.warn(msg, e); + Element descriptionElem = action.getContent().element( + "description"); + if (descriptionElem == null) { + try { + Page programInfo = action.execute(); + description = programInfo.getContent().element( + "description").getText().trim(); + keywords = programInfo.getContent().element( + "keywords").getText().trim(); + } catch (PageException e) { + String msg = "Program details could not be determined for '" + + action.getName() + "'"; + aReport.addMessage(msg, e); + LOG.warn(msg, e); + } + } else { + description = descriptionElem.getTextTrim(); } } Program program = new Program(aChannel, action.getName(),