/* * Copyright 2005 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.wamblee.crawler.kiss.main; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.mail.MessagingException; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.wamblee.crawler.Action; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; import org.wamblee.crawler.PageException; import org.wamblee.crawler.impl.ConfigurationParser; import org.wamblee.crawler.impl.CrawlerImpl; import org.wamblee.crawler.kiss.guide.Channel; import org.wamblee.crawler.kiss.guide.PrintVisitor; import org.wamblee.crawler.kiss.guide.Program; import org.wamblee.crawler.kiss.guide.TVGuide; import org.wamblee.crawler.kiss.guide.Time; import org.wamblee.crawler.kiss.guide.TimeInterval; import org.wamblee.crawler.kiss.notification.NotificationException; import org.wamblee.crawler.kiss.notification.Notifier; import org.wamblee.general.BeanFactory; import org.wamblee.xml.ClasspathUriResolver; import org.wamblee.xml.XslTransformer; /** * The KiSS crawler for automatic recording of interesting TV shows. * */ public class KissCrawler { private static final Log LOG = LogFactory.getLog(KissCrawler.class); /** * Start URL of the electronic programme guide. */ private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; /** * Default socket timeout to use. */ private static final int SOCKET_TIMEOUT = 10000; /** * Regular expression for matching time interval strings in the retrieved * pages. */ private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; /** * Compiled pattern for the time regular expression. */ private Pattern _pattern; /** * Runs the KiSS crawler. * * @param aArgs * Arguments, currently all ignored because they are hardcoded. * @throws Exception * In case of problems. */ public static void main(String[] aArgs) throws Exception { String crawlerConfig = new File(aArgs[0]).getCanonicalPath(); String programConfig = new File(aArgs[1]).getCanonicalPath(); BeanFactory factory = new StandaloneCrawlerBeanFactory(); Notifier notifier = factory.find(Notifier.class); new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig, notifier, new Report()); } /** * Constructs the crawler. This retrieves the TV guide by crawling the KiSS * EPG guide, filters the guide for interesting programs, tries to record * them, and sends a summary mail to the user. * * @param aCrawlerConfig * Configuration file for the crawler. * @param aProgramConfig * Configuration file describing interesting shows. * @param aNotifier Object used to send notifications of the results. * @param aReport Report to use. * @throws IOException * In case of problems reading files. * @throws NotificationException In case notification fails. * @throws PageException In case of problems retrieving the TV guide. */ public KissCrawler(String aCrawlerConfig, String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException { this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig, aNotifier, aReport); } /** * Constructs the crawler. This retrieves the TV guide by crawling the KiSS * EPG guide, filters the guide for interesting programs, tries to record * them, and sends a summary mail to the user. * * @param aStartUrl * Start URL of the electronic programme guide. * @param aSocketTimeout Socket timeout to use. * @param aCrawlerConfig * Configuration file for the crawler. * @param aProgramConfig * Configuration file describing interesting shows. * @param aNotifier Object used to send notifications of the results. * @param aReport Report to use. * @throws IOException * In case of problems reading files. * @throws NotificationException In case notification fails. * @throws PageException In case of problems retrieving the TV guide. */ public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig, String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException { _pattern = Pattern.compile(TIME_REGEX); try { HttpClient client = new HttpClient(); // client.getHostConfiguration().setProxy("127.0.0.1", 3128); client.getParams().setParameter("http.socket.timeout", SOCKET_TIMEOUT); XslTransformer transformer = new XslTransformer( new ClasspathUriResolver()); Crawler crawler = createCrawler(aCrawlerConfig, client, transformer); InputStream programConfigFile = new FileInputStream(new File( aProgramConfig)); ProgramConfigurationParser parser = new ProgramConfigurationParser( transformer); parser.parse(programConfigFile); List programFilters = parser.getFilters(); try { Page page = getStartPage(aStartUrl, crawler, aReport); TVGuide guide = createGuide(page, aReport); PrintVisitor printer = new PrintVisitor(System.out); guide.accept(printer); processResults(programFilters, guide, aNotifier, aReport); } catch (PageException e) { aReport.addMessage("Problem getting TV guide", e); LOG.info("Problem getting TV guide", e); throw e; } aNotifier.send(aReport.asXml()); } finally { System.out.println("Crawler finished"); } } /** * Records interesting shows. * * @param aProgramCondition * Condition determining which shows are interesting. * @param aGuide * Television guide. * @throws MessagingException * In case of problems sending a summary mail. */ private void processResults(List aProgramCondition, TVGuide aGuide, Notifier aNotifier, Report aReport) { ProgramActionExecutor executor = new ProgramActionExecutor(aReport); for (ProgramFilter filter : aProgramCondition) { List programs = filter.apply(aGuide); ProgramAction action = filter.getAction(); for (Program program : programs) { action.execute(program, executor); } } executor.commit(); } /** * Creates the crawler. * * @param aCrawlerConfig * Crawler configuration file. * @param aOs * Logging output stream for the crawler. * @param aClient * HTTP Client to use. * @return Crawler. * @throws FileNotFoundException * In case configuration files cannot be found. */ private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient, XslTransformer aTransformer) throws FileNotFoundException { ConfigurationParser parser = new ConfigurationParser(aTransformer); InputStream crawlerConfigFile = new FileInputStream(new File( aCrawlerConfig)); Configuration config = parser.parse(crawlerConfigFile); Crawler crawler = new CrawlerImpl(aClient, config); return crawler; } /** * Gets the start page of the electronic programme guide. This involves * login and navigation to a suitable start page after logging in. * * @param aStartUrl * URL of the electronic programme guide. * @param aCrawler * Crawler to use. * @param aReport * Report to use. * @return Starting page. */ private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport) throws PageException { try { Page page = aCrawler.getPage(aStartUrl); Action favorites = page.getAction("channels-favorites"); if (favorites == null) { String msg = "Channels favorites action not found on start page"; throw new PageException(msg); } return favorites.execute(); } catch (PageException e) { String msg = "Could not complete login to electronic programme guide."; throw new PageException(msg, e); } } /** * Creates the TV guide by web crawling. * * @param aPage * Starting page. * @param aReport * Report to use. * @return TV guide. */ private TVGuide createGuide(Page aPage, Report aReport) { LOG.info("Obtaining full TV guide"); Action[] actions = aPage.getActions(); if ( actions.length == 0 ) { LOG.error("No channels found"); aReport.addMessage("No channels found"); } List channels = new ArrayList(); for (Action action : actions) { try { LOG.info("Getting channel info for '" + action.getName() + "'"); Action rightNow = action.execute().getAction("right-now"); if (rightNow == null) { throw new PageException("Channel summary page for '" + action.getName() + "' does not contain required information"); } Channel channel = createChannel(action.getName(), rightNow .execute(), aReport); channels.add(channel); if (SystemProperties.isDebugMode()) { break; // Only one channel is crawled. } } catch (PageException e) { aReport.addMessage("Could not create channel information for '" + action.getName() + "'"); LOG.error("Could not create channel information for '" + action.getName() + "'", e); } } return new TVGuide(channels); } /** * Create channel information for a specific channel. * * @param aChannel * Channel name. * @param aPage * Starting page for the channel. * @return Channel. */ private Channel createChannel(String aChannel, Page aPage, Report aReport) { LOG.info("Obtaining program for " + aChannel); Action[] programActions = aPage.getActions(); List programs = new ArrayList(); for (Action action : programActions) { String time = action.getContent().element("time").getText().trim(); Matcher matcher = _pattern.matcher(time); if (matcher.matches()) { Time begin = new Time(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2))); Time end = new Time(Integer.parseInt(matcher.group(3)), Integer .parseInt(matcher.group(4))); TimeInterval interval = new TimeInterval(begin, end); String description = ""; String keywords = ""; if (!SystemProperties.isNoProgramDetailsRequired()) { try { Page programInfo = action.execute(); description = programInfo.getContent().element( "description").getText().trim(); keywords = programInfo.getContent().element("keywords") .getText().trim(); } catch (PageException e) { String msg = "Program details could not be determined for '" + action.getName() + "'"; aReport.addMessage(msg, e); LOG.warn(msg, e); } } Program program = new Program(aChannel, action.getName(), description, keywords, interval, action); LOG.info("Got program " + program); programs.add(program); } } return new Channel(aChannel, programs); } }