X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fkiss%2Fsrc%2Fmain%2Fjava%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2Fmain%2FKissCrawler.java;fp=crawler%2Fkiss%2Fsrc%2Fmain%2Fjava%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2Fmain%2FKissCrawler.java;h=3300e1299e71b6c4268d9ba9447b65d6df17fe8f;hb=62f165891f08ae532b5a794af11d7338a93f9a43;hp=0000000000000000000000000000000000000000;hpb=07cedd3f0730646ea35a7f668b3e1e872a4605d9;p=utils diff --git a/crawler/kiss/src/main/java/org/wamblee/crawler/kiss/main/KissCrawler.java b/crawler/kiss/src/main/java/org/wamblee/crawler/kiss/main/KissCrawler.java new file mode 100644 index 00000000..3300e129 --- /dev/null +++ b/crawler/kiss/src/main/java/org/wamblee/crawler/kiss/main/KissCrawler.java @@ -0,0 +1,368 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss.main; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.mail.MessagingException; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Element; +import org.wamblee.crawler.Action; +import org.wamblee.crawler.Configuration; +import org.wamblee.crawler.Crawler; +import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageException; +import org.wamblee.crawler.impl.ConfigurationParser; +import org.wamblee.crawler.impl.CrawlerImpl; +import org.wamblee.crawler.kiss.guide.Channel; +import org.wamblee.crawler.kiss.guide.PrintVisitor; +import org.wamblee.crawler.kiss.guide.Program; +import org.wamblee.crawler.kiss.guide.TVGuide; +import org.wamblee.crawler.kiss.guide.Time; +import org.wamblee.crawler.kiss.guide.TimeInterval; +import org.wamblee.crawler.kiss.notification.NotificationException; +import org.wamblee.crawler.kiss.notification.Notifier; +import org.wamblee.general.BeanFactory; +import org.wamblee.xml.ClasspathUriResolver; +import org.wamblee.xml.XslTransformer; + +/** + * The KiSS crawler for automatic recording of interesting TV shows. + * + */ +public class KissCrawler { + + private static final Log LOG = LogFactory.getLog(KissCrawler.class); + + /** + * Start URL of the electronic programme guide. + */ + private static final String START_URL = "http://epg.kml.kiss-technology.com/login.php"; + + /** + * Default socket timeout to use. + */ + private static final int SOCKET_TIMEOUT = 10000; + + /** + * Regular expression for matching time interval strings in the retrieved + * pages. + */ + private static final String TIME_REGEX = "[^0-9]*([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; + + /** + * Compiled pattern for the time regular expression. + */ + private Pattern _pattern; + + /** + * Runs the KiSS crawler. + * + * @param aArgs + * Arguments, currently all ignored because they are hardcoded. + * @throws Exception + * In case of problems. + */ + public static void main(String[] aArgs) throws Exception { + String crawlerConfig = new File(aArgs[0]).getCanonicalPath(); + String programConfig = new File(aArgs[1]).getCanonicalPath(); + + BeanFactory factory = new StandaloneCrawlerBeanFactory(); + Notifier notifier = factory.find(Notifier.class); + new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, + programConfig, notifier, new Report()); + } + + /** + * Constructs the crawler. This retrieves the TV guide by crawling the KiSS + * EPG guide, filters the guide for interesting programs, tries to record + * them, and sends a summary mail to the user. + * + * @param aCrawlerConfig + * Configuration file for the crawler. + * @param aProgramConfig + * Configuration file describing interesting shows. + * @param aNotifier + * Object used to send notifications of the results. + * @param aReport + * Report to use. + * @throws IOException + * In case of problems reading files. + * @throws NotificationException + * In case notification fails. + * @throws PageException + * In case of problems retrieving the TV guide. + */ + public KissCrawler(String aCrawlerConfig, String aProgramConfig, + Notifier aNotifier, Report aReport) throws IOException, + NotificationException, PageException { + this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig, + aNotifier, aReport); + } + + /** + * Constructs the crawler. This retrieves the TV guide by crawling the KiSS + * EPG guide, filters the guide for interesting programs, tries to record + * them, and sends a summary mail to the user. + * + * @param aStartUrl + * Start URL of the electronic programme guide. + * @param aSocketTimeout + * Socket timeout to use. + * @param aCrawlerConfig + * Configuration file for the crawler. + * @param aProgramConfig + * Configuration file describing interesting shows. + * @param aNotifier + * Object used to send notifications of the results. + * @param aReport + * Report to use. + * @throws IOException + * In case of problems reading files. + * @throws NotificationException + * In case notification fails. + * @throws PageException + * In case of problems retrieving the TV guide. + */ + public KissCrawler(String aStartUrl, int aSocketTimeout, + String aCrawlerConfig, String aProgramConfig, Notifier aNotifier, + Report aReport) throws IOException, NotificationException, + PageException { + + _pattern = Pattern.compile(TIME_REGEX); + + try { + HttpClient client = new HttpClient(); + // client.getHostConfiguration().setProxy("127.0.0.1", 3128); + client.getParams().setParameter("http.socket.timeout", + SOCKET_TIMEOUT); + + XslTransformer transformer = new XslTransformer( + new ClasspathUriResolver()); + + Crawler crawler = createCrawler(aCrawlerConfig, client, transformer); + InputStream programConfigFile = new FileInputStream(new File( + aProgramConfig)); + ProgramConfigurationParser parser = new ProgramConfigurationParser(); + parser.parse(programConfigFile); + List programFilters = parser.getFilters(); + + try { + Page page = getStartPage(aStartUrl, crawler, aReport); + TVGuide guide = createGuide(page, aReport); + PrintVisitor printer = new PrintVisitor(System.out); + guide.accept(printer); + processResults(programFilters, guide, aNotifier, aReport); + } catch (PageException e) { + aReport.addMessage("Problem getting TV guide", e); + LOG.info("Problem getting TV guide", e); + throw e; + } + aNotifier.send(aReport.asXml()); + } finally { + System.out.println("Crawler finished"); + } + } + + /** + * Records interesting shows. + * + * @param aProgramCondition + * Condition determining which shows are interesting. + * @param aGuide + * Television guide. + * @throws MessagingException + * In case of problems sending a summary mail. + */ + private void processResults(List aProgramCondition, + TVGuide aGuide, Notifier aNotifier, Report aReport) { + ProgramActionExecutor executor = new ProgramActionExecutor(aReport); + for (ProgramFilter filter : aProgramCondition) { + List programs = filter.apply(aGuide); + ProgramAction action = filter.getAction(); + for (Program program : programs) { + action.execute(program, executor); + } + } + executor.commit(); + + } + + /** + * Creates the crawler. + * + * @param aCrawlerConfig + * Crawler configuration file. + * @param aOs + * Logging output stream for the crawler. + * @param aClient + * HTTP Client to use. + * @return Crawler. + * @throws FileNotFoundException + * In case configuration files cannot be found. + */ + private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient, + XslTransformer aTransformer) throws FileNotFoundException { + ConfigurationParser parser = new ConfigurationParser(aTransformer); + InputStream crawlerConfigFile = new FileInputStream(new File( + aCrawlerConfig)); + Configuration config = parser.parse(crawlerConfigFile); + Crawler crawler = new CrawlerImpl(aClient, config); + return crawler; + } + + /** + * Gets the start page of the electronic programme guide. This involves + * login and navigation to a suitable start page after logging in. + * + * @param aStartUrl + * URL of the electronic programme guide. + * @param aCrawler + * Crawler to use. + * @param aReport + * Report to use. + * @return Starting page. + */ + private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport) + throws PageException { + try { + Page page = aCrawler.getPage(aStartUrl, new NameValuePair[0]); + page = page.getAction("login").execute(); + Action favorites = page.getAction("channels-favorites"); + if (favorites == null) { + String msg = "Channels favorites action not found on start page"; + throw new PageException(msg); + } + return favorites.execute(); + } catch (PageException e) { + String msg = "Could not complete login to electronic programme guide."; + throw new PageException(msg, e); + } + } + + /** + * Creates the TV guide by web crawling. + * + * @param aPage + * Starting page. + * @param aReport + * Report to use. + * @return TV guide. + * @throws PageException + * In case of problem getting the tv guide. + */ + private TVGuide createGuide(Page aPage, Report aReport) + throws PageException { + LOG.info("Obtaining full TV guide"); + Action[] actions = aPage.getActions(); + if (actions.length == 0) { + LOG.error("No channels found"); + throw new PageException("No channels found"); + } + List channels = new ArrayList(); + for (Action action : actions) { + try { + LOG.info("Getting channel info for '" + action.getName() + "'"); + Action tomorrow = action.execute().getAction("tomorrow"); + if (tomorrow == null) { + throw new PageException("Channel summary page for '" + + action.getName() + + "' does not contain required information"); + } + Channel channel = createChannel(action.getName(), tomorrow + .execute(), aReport); + channels.add(channel); + if (SystemProperties.isDebugMode()) { + break; // Only one channel is crawled. + } + } catch (PageException e) { + aReport.addMessage("Could not create channel information for '" + + action.getName() + "'"); + LOG.error("Could not create channel information for '" + + action.getName() + "'", e); + } + } + return new TVGuide(channels); + } + + /** + * Create channel information for a specific channel. + * + * @param aChannel + * Channel name. + * @param aPage + * Starting page for the channel. + * @return Channel. + */ + private Channel createChannel(String aChannel, Page aPage, Report aReport) { + LOG.info("Obtaining program for " + aChannel); + Action[] programActions = aPage.getActions(); + List programs = new ArrayList(); + for (Action action : programActions) { + String time = action.getContent().element("time").getText().trim(); + Matcher matcher = _pattern.matcher(time); + if (matcher.matches()) { + Time begin = new Time(Integer.parseInt(matcher.group(1)), + Integer.parseInt(matcher.group(2))); + Time end = new Time(Integer.parseInt(matcher.group(3)), Integer + .parseInt(matcher.group(4))); + TimeInterval interval = new TimeInterval(begin, end); + String description = ""; + String keywords = ""; + + if (!SystemProperties.isNoProgramDetailsRequired()) { + Element descriptionElem = action.getContent().element( + "description"); + if (descriptionElem == null) { + try { + Page programInfo = action.execute(); + description = programInfo.getContent().element( + "description").getText().trim(); + keywords = programInfo.getContent().element( + "keywords").getText().trim(); + } catch (PageException e) { + String msg = "Program details could not be determined for '" + + action.getName() + "'"; + aReport.addMessage(msg, e); + LOG.warn(msg, e); + } + } else { + description = descriptionElem.getTextTrim(); + } + } + Program program = new Program(aChannel, action.getName(), + description, keywords, interval, action); + + LOG.info("Got program " + program); + programs.add(program); + } + } + return new Channel(aChannel, programs); + } +}