+++ /dev/null
-/*
- * Copyright 2005 the original author or authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.wamblee.crawler.kiss.main;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.mail.MessagingException;
-
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.NameValuePair;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.dom4j.Element;
-import org.wamblee.crawler.Action;
-import org.wamblee.crawler.Configuration;
-import org.wamblee.crawler.Crawler;
-import org.wamblee.crawler.Page;
-import org.wamblee.crawler.PageException;
-import org.wamblee.crawler.impl.ConfigurationParser;
-import org.wamblee.crawler.impl.CrawlerImpl;
-import org.wamblee.crawler.kiss.guide.Channel;
-import org.wamblee.crawler.kiss.guide.PrintVisitor;
-import org.wamblee.crawler.kiss.guide.Program;
-import org.wamblee.crawler.kiss.guide.TVGuide;
-import org.wamblee.crawler.kiss.guide.Time;
-import org.wamblee.crawler.kiss.guide.TimeInterval;
-import org.wamblee.crawler.kiss.notification.NotificationException;
-import org.wamblee.crawler.kiss.notification.Notifier;
-import org.wamblee.general.BeanFactory;
-import org.wamblee.xml.ClasspathUriResolver;
-import org.wamblee.xml.XslTransformer;
-
-/**
- * The KiSS crawler for automatic recording of interesting TV shows.
- *
- */
-public class KissCrawler {
-
-    private static final Log LOG = LogFactory.getLog(KissCrawler.class);
-
-    /**
-     * Start URL of the electronic programme guide.
-     */
-    private static final String START_URL = "http://epg.kml.kiss-technology.com/login.php";
-
-    /**
-     * Default socket timeout to use (passed to HttpClient as
-     * <code>http.socket.timeout</code>).
-     */
-    private static final int SOCKET_TIMEOUT = 10000;
-
-    /**
-     * Regular expression for matching time interval strings in the retrieved
-     * pages. Groups 1 and 2 capture the begin hour and minute, groups 3 and 4
-     * the end hour and minute.
-     */
-    private static final String TIME_REGEX = "[^0-9]*([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
-
-    /**
-     * Compiled pattern for the time regular expression. Compiled once for the
-     * class because the expression is constant.
-     */
-    private static final Pattern TIME_PATTERN = Pattern.compile(TIME_REGEX);
-
-    /**
-     * Runs the KiSS crawler.
-     *
-     * @param aArgs
-     *            Command line arguments: <code>aArgs[0]</code> is the path of
-     *            the crawler configuration file and <code>aArgs[1]</code> the
-     *            path of the program configuration file. Additional arguments
-     *            are ignored.
-     * @throws IllegalArgumentException
-     *             If fewer than two arguments are given.
-     * @throws Exception
-     *             In case of problems.
-     */
-    public static void main(String[] aArgs) throws Exception {
-        if (aArgs == null || aArgs.length < 2) {
-            throw new IllegalArgumentException(
-                    "Usage: KissCrawler <crawler-config-file> <program-config-file>");
-        }
-        String crawlerConfig = new File(aArgs[0]).getCanonicalPath();
-        String programConfig = new File(aArgs[1]).getCanonicalPath();
-
-        BeanFactory factory = new StandaloneCrawlerBeanFactory();
-        Notifier notifier = factory.find(Notifier.class);
-        new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig,
-                programConfig, notifier, new Report());
-    }
-
-    /**
-     * Constructs the crawler using the default start URL and socket timeout.
-     * This retrieves the TV guide by crawling the KiSS EPG guide, filters the
-     * guide for interesting programs, tries to record them, and sends a
-     * summary to the user through the notifier.
-     *
-     * @param aCrawlerConfig
-     *            Configuration file for the crawler.
-     * @param aProgramConfig
-     *            Configuration file describing interesting shows.
-     * @param aNotifier
-     *            Object used to send notifications of the results.
-     * @param aReport
-     *            Report to use.
-     * @throws IOException
-     *             In case of problems reading files.
-     * @throws NotificationException
-     *             In case notification fails.
-     * @throws PageException
-     *             In case of problems retrieving the TV guide.
-     */
-    public KissCrawler(String aCrawlerConfig, String aProgramConfig,
-            Notifier aNotifier, Report aReport) throws IOException,
-            NotificationException, PageException {
-        this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig,
-                aNotifier, aReport);
-    }
-
-    /**
-     * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
-     * EPG guide, filters the guide for interesting programs, tries to record
-     * them, and sends a summary to the user through the notifier. The
-     * retrieved guide is also printed to standard output.
-     *
-     * @param aStartUrl
-     *            Start URL of the electronic programme guide.
-     * @param aSocketTimeout
-     *            Socket timeout to use.
-     * @param aCrawlerConfig
-     *            Configuration file for the crawler.
-     * @param aProgramConfig
-     *            Configuration file describing interesting shows.
-     * @param aNotifier
-     *            Object used to send notifications of the results.
-     * @param aReport
-     *            Report to use.
-     * @throws IOException
-     *             In case of problems reading files.
-     * @throws NotificationException
-     *             In case notification fails.
-     * @throws PageException
-     *             In case of problems retrieving the TV guide.
-     */
-    public KissCrawler(String aStartUrl, int aSocketTimeout,
-            String aCrawlerConfig, String aProgramConfig, Notifier aNotifier,
-            Report aReport) throws IOException, NotificationException,
-            PageException {
-
-        try {
-            HttpClient client = new HttpClient();
-            // Use the timeout supplied by the caller; previously the class
-            // default was always applied and the parameter was ignored.
-            client.getParams().setParameter("http.socket.timeout",
-                    aSocketTimeout);
-
-            XslTransformer transformer = new XslTransformer(
-                    new ClasspathUriResolver());
-
-            Crawler crawler = createCrawler(aCrawlerConfig, client, transformer);
-
-            List<ProgramFilter> programFilters;
-            InputStream programConfigFile = new FileInputStream(new File(
-                    aProgramConfig));
-            try {
-                ProgramConfigurationParser parser = new ProgramConfigurationParser();
-                parser.parse(programConfigFile);
-                programFilters = parser.getFilters();
-            } finally {
-                // The stream is owned by this constructor, so release it
-                // whether or not parsing succeeded.
-                closeQuietly(programConfigFile);
-            }
-
-            try {
-                Page page = getStartPage(aStartUrl, crawler, aReport);
-                TVGuide guide = createGuide(page, aReport);
-                PrintVisitor printer = new PrintVisitor(System.out);
-                guide.accept(printer);
-                processResults(programFilters, guide, aNotifier, aReport);
-            } catch (PageException e) {
-                // NOTE(review): the message is added to the report, but the
-                // rethrow skips the send below, so this report is never
-                // delivered. Confirm whether that is intended.
-                aReport.addMessage("Problem getting TV guide", e);
-                LOG.info("Problem getting TV guide", e);
-                throw e;
-            }
-            aNotifier.send(aReport.asXml());
-        } finally {
-            System.out.println("Crawler finished");
-        }
-    }
-
-    /**
-     * Closes a configuration stream, logging instead of propagating a failure
-     * so that an exception from the surrounding code is not masked.
-     *
-     * @param aStream
-     *            Stream to close.
-     */
-    private static void closeQuietly(InputStream aStream) {
-        try {
-            aStream.close();
-        } catch (IOException e) {
-            LOG.warn("Could not close configuration file", e);
-        }
-    }
-
-    /**
-     * Returns the trimmed text of an element, or the empty string if the
-     * element is absent.
-     *
-     * @param aElement
-     *            Element to read, may be null.
-     * @return Trimmed text, never null.
-     */
-    private static String textOrEmpty(Element aElement) {
-        return aElement == null ? "" : aElement.getText().trim();
-    }
-
-    /**
-     * Applies every program filter to the guide and executes the action of
-     * each filter on the programs it selects. All actions are collected in a
-     * single executor which is committed at the end.
-     *
-     * @param aProgramCondition
-     *            Filters determining which shows are interesting.
-     * @param aGuide
-     *            Television guide.
-     * @param aNotifier
-     *            Notifier; currently not used by this method.
-     * @param aReport
-     *            Report handed to the action executor.
-     */
-    private void processResults(List<ProgramFilter> aProgramCondition,
-            TVGuide aGuide, Notifier aNotifier, Report aReport) {
-        ProgramActionExecutor executor = new ProgramActionExecutor(aReport);
-        for (ProgramFilter filter : aProgramCondition) {
-            List<Program> programs = filter.apply(aGuide);
-            ProgramAction action = filter.getAction();
-            for (Program program : programs) {
-                action.execute(program, executor);
-            }
-        }
-        executor.commit();
-    }
-
-    /**
-     * Creates the crawler.
-     *
-     * @param aCrawlerConfig
-     *            Crawler configuration file.
-     * @param aClient
-     *            HTTP Client to use.
-     * @param aTransformer
-     *            Transformer passed to the configuration parser.
-     * @return Crawler.
-     * @throws FileNotFoundException
-     *             In case configuration files cannot be found.
-     */
-    private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient,
-            XslTransformer aTransformer) throws FileNotFoundException {
-        ConfigurationParser parser = new ConfigurationParser(aTransformer);
-        InputStream crawlerConfigFile = new FileInputStream(new File(
-                aCrawlerConfig));
-        try {
-            Configuration config = parser.parse(crawlerConfigFile);
-            return new CrawlerImpl(aClient, config);
-        } finally {
-            // Release the file handle once the configuration has been parsed.
-            closeQuietly(crawlerConfigFile);
-        }
-    }
-
-    /**
-     * Gets the start page of the electronic programme guide. This involves
-     * login and navigation to a suitable start page after logging in.
-     *
-     * @param aStartUrl
-     *            URL of the electronic programme guide.
-     * @param aCrawler
-     *            Crawler to use.
-     * @param aReport
-     *            Report to use; currently not used by this method.
-     * @return Starting page.
-     * @throws PageException
-     *             If a page cannot be retrieved or a required action (login,
-     *             channels-favorites) is missing. The original problem is
-     *             attached as the cause.
-     */
-    private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport)
-            throws PageException {
-        try {
-            Page page = aCrawler.getPage(aStartUrl, new NameValuePair[0]);
-            Action login = page.getAction("login");
-            if (login == null) {
-                // Report a missing login action the same way as a missing
-                // favorites action, instead of failing with a bare NPE.
-                throw new PageException("Login action not found on start page");
-            }
-            page = login.execute();
-            Action favorites = page.getAction("channels-favorites");
-            if (favorites == null) {
-                String msg = "Channels favorites action not found on start page";
-                throw new PageException(msg);
-            }
-            return favorites.execute();
-        } catch (PageException e) {
-            String msg = "Could not complete login to electronic programme guide.";
-            throw new PageException(msg, e);
-        }
-    }
-
-    /**
-     * Creates the TV guide by web crawling. Every action on the given page is
-     * treated as a channel; a channel that cannot be retrieved is reported and
-     * logged, and crawling continues with the next one.
-     *
-     * @param aPage
-     *            Starting page.
-     * @param aReport
-     *            Report to use.
-     * @return TV guide.
-     * @throws PageException
-     *             In case the page contains no channels at all.
-     */
-    private TVGuide createGuide(Page aPage, Report aReport)
-            throws PageException {
-        LOG.info("Obtaining full TV guide");
-        Action[] actions = aPage.getActions();
-        if (actions.length == 0) {
-            LOG.error("No channels found");
-            throw new PageException("No channels found");
-        }
-        List<Channel> channels = new ArrayList<Channel>();
-        for (Action action : actions) {
-            try {
-                LOG.info("Getting channel info for '" + action.getName() + "'");
-                Action tomorrow = action.execute().getAction("tomorrow");
-                if (tomorrow == null) {
-                    throw new PageException("Channel summary page for '"
-                            + action.getName()
-                            + "' does not contain required information");
-                }
-                Channel channel = createChannel(action.getName(), tomorrow
-                        .execute(), aReport);
-                channels.add(channel);
-                if (SystemProperties.isDebugMode()) {
-                    break; // Only one channel is crawled.
-                }
-            } catch (PageException e) {
-                aReport.addMessage("Could not create channel information for '"
-                        + action.getName() + "'");
-                LOG.error("Could not create channel information for '"
-                        + action.getName() + "'", e);
-            }
-        }
-        return new TVGuide(channels);
-    }
-
-    /**
-     * Create channel information for a specific channel. Every action on the
-     * page is a candidate program. Entries without a "time" element, or whose
-     * time text does not match the expected interval format, are skipped.
-     *
-     * @param aChannel
-     *            Channel name.
-     * @param aPage
-     *            Starting page for the channel.
-     * @param aReport
-     *            Report that receives a message when program details cannot
-     *            be retrieved.
-     * @return Channel.
-     */
-    private Channel createChannel(String aChannel, Page aPage, Report aReport) {
-        LOG.info("Obtaining program for " + aChannel);
-        Action[] programActions = aPage.getActions();
-        List<Program> programs = new ArrayList<Program>();
-        for (Action action : programActions) {
-            Element timeElem = action.getContent().element("time");
-            if (timeElem == null) {
-                // Without a time the entry cannot be scheduled; skip it
-                // rather than abort the whole crawl with an NPE.
-                LOG.warn("No time information found for '" + action.getName()
-                        + "'");
-                continue;
-            }
-            Matcher matcher = TIME_PATTERN.matcher(timeElem.getText().trim());
-            if (!matcher.matches()) {
-                continue;
-            }
-            Time begin = new Time(Integer.parseInt(matcher.group(1)),
-                    Integer.parseInt(matcher.group(2)));
-            Time end = new Time(Integer.parseInt(matcher.group(3)),
-                    Integer.parseInt(matcher.group(4)));
-            TimeInterval interval = new TimeInterval(begin, end);
-            String description = "";
-            String keywords = "";
-
-            if (!SystemProperties.isNoProgramDetailsRequired()) {
-                Element descriptionElem = action.getContent().element(
-                        "description");
-                if (descriptionElem == null) {
-                    // No description on the overview entry: follow the
-                    // action to the program's own page for the details.
-                    try {
-                        Page programInfo = action.execute();
-                        description = textOrEmpty(programInfo.getContent()
-                                .element("description"));
-                        keywords = textOrEmpty(programInfo.getContent()
-                                .element("keywords"));
-                    } catch (PageException e) {
-                        String msg = "Program details could not be determined for '"
-                                + action.getName() + "'";
-                        aReport.addMessage(msg, e);
-                        LOG.warn(msg, e);
-                    }
-                } else {
-                    description = descriptionElem.getTextTrim();
-                }
-            }
-            Program program = new Program(aChannel, action.getName(),
-                    description, keywords, interval, action);
-
-            LOG.info("Got program " + program);
-            programs.add(program);
-        }
-        return new Channel(aChannel, programs);
-    }
-}