/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org.wamblee.crawler.kiss.main;
20 import java.io.FileInputStream;
21 import java.io.FileNotFoundException;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.util.ArrayList;
25 import java.util.List;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
29 import javax.mail.MessagingException;
31 import org.apache.commons.httpclient.HttpClient;
32 import org.apache.commons.httpclient.NameValuePair;
33 import org.apache.commons.logging.Log;
34 import org.apache.commons.logging.LogFactory;
35 import org.dom4j.Element;
36 import org.wamblee.crawler.Action;
37 import org.wamblee.crawler.Configuration;
38 import org.wamblee.crawler.Crawler;
39 import org.wamblee.crawler.Page;
40 import org.wamblee.crawler.PageException;
41 import org.wamblee.crawler.impl.ConfigurationParser;
42 import org.wamblee.crawler.impl.CrawlerImpl;
43 import org.wamblee.crawler.kiss.guide.Channel;
44 import org.wamblee.crawler.kiss.guide.PrintVisitor;
45 import org.wamblee.crawler.kiss.guide.Program;
46 import org.wamblee.crawler.kiss.guide.TVGuide;
47 import org.wamblee.crawler.kiss.guide.Time;
48 import org.wamblee.crawler.kiss.guide.TimeInterval;
49 import org.wamblee.crawler.kiss.notification.NotificationException;
50 import org.wamblee.crawler.kiss.notification.Notifier;
51 import org.wamblee.general.BeanFactory;
52 import org.wamblee.xml.ClasspathUriResolver;
53 import org.wamblee.xml.XslTransformer;
/**
 * The KiSS crawler for automatic recording of interesting TV shows.
 *
 * @author Erik Brakkee
 */
61 public class KissCrawler {
63 private static final Log LOG = LogFactory.getLog(KissCrawler.class);
66 * Start URL of the electronic programme guide.
68 private static final String START_URL = "http://epg.kml.kiss-technology.com/login.php";
71 * Default socket timeout to use.
73 private static final int SOCKET_TIMEOUT = 10000;
76 * Regular expression for matching time interval strings in the retrieved
79 private static final String TIME_REGEX = "[^0-9]*([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
82 * Compiled pattern for the time regular expression.
84 private Pattern _pattern;
87 * Runs the KiSS crawler.
90 * Arguments: First argument is the crawler configuration file,
91 * and second is the program configuration file.
93 * In case of problems.
95 public static void main(String[] aArgs) throws Exception {
96 String crawlerConfig = new File(aArgs[0]).getCanonicalPath();
97 String programConfig = new File(aArgs[1]).getCanonicalPath();
99 BeanFactory factory = new StandaloneCrawlerBeanFactory();
100 Notifier notifier = factory.find(Notifier.class);
101 new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig,
102 programConfig, notifier, new Report());
/**
 * Constructs the crawler with the default start URL and the default socket
 * timeout. All work is delegated to the six-argument constructor: the TV
 * guide is retrieved by crawling the KiSS EPG guide, filtered for
 * interesting programs, recording is attempted, and a summary is sent to
 * the user.
 *
 * @param aCrawlerConfig
 *            Configuration file for the crawler.
 * @param aProgramConfig
 *            Configuration file describing interesting shows.
 * @param aNotifier
 *            Object used to send notifications of the results.
 * @param aReport
 *            Report passed on unchanged to the six-argument constructor.
 * @throws IOException
 *             In case of problems reading files.
 * @throws NotificationException
 *             In case notification fails.
 * @throws PageException
 *             In case of problems retrieving the TV guide.
 */
public KissCrawler(String aCrawlerConfig, String aProgramConfig,
        Notifier aNotifier, Report aReport)
        throws IOException, NotificationException, PageException {
    this(START_URL,
            SOCKET_TIMEOUT,
            aCrawlerConfig,
            aProgramConfig,
            aNotifier,
            aReport);
}
133 * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
134 * EPG guide, filters the guide for interesting programs, tries to record
135 * them, and sends a summary mail to the user.
138 * Start URL of the electronic programme guide.
139 * @param aSocketTimeout
140 * Socket timeout to use.
141 * @param aCrawlerConfig
142 * Configuration file for the crawler.
143 * @param aProgramConfig
144 * Configuration file describing interesting shows.
146 * Object used to send notifications of the results.
149 * @throws IOException
150 * In case of problems reading files.
151 * @throws NotificationException
152 * In case notification fails.
153 * @throws PageException
154 * In case of problems retrieving the TV guide.
156 public KissCrawler(String aStartUrl, int aSocketTimeout,
157 String aCrawlerConfig, String aProgramConfig, Notifier aNotifier,
158 Report aReport) throws IOException, NotificationException,
161 _pattern = Pattern.compile(TIME_REGEX);
164 HttpClient client = new HttpClient();
165 // client.getHostConfiguration().setProxy("127.0.0.1", 3128);
166 client.getParams().setParameter("http.socket.timeout",
169 XslTransformer transformer = new XslTransformer(
170 new ClasspathUriResolver());
172 Crawler crawler = createCrawler(aCrawlerConfig, client, transformer);
173 InputStream programConfigFile = new FileInputStream(new File(
175 ProgramConfigurationParser parser = new ProgramConfigurationParser();
176 parser.parse(programConfigFile);
177 List<ProgramFilter> programFilters = parser.getFilters();
180 Page page = getStartPage(aStartUrl, crawler, aReport);
181 TVGuide guide = createGuide(page, aReport);
182 PrintVisitor printer = new PrintVisitor(System.out);
183 guide.accept(printer);
184 processResults(programFilters, guide, aNotifier, aReport);
185 } catch (PageException e) {
186 aReport.addMessage("Problem getting TV guide", e);
187 LOG.info("Problem getting TV guide", e);
190 aNotifier.send(aReport.asXml());
192 System.out.println("Crawler finished");
197 * Records interesting shows.
199 * @param aProgramCondition
200 * Condition determining which shows are interesting.
203 * @throws MessagingException
204 * In case of problems sending a summary mail.
206 private void processResults(List<ProgramFilter> aProgramCondition,
207 TVGuide aGuide, Notifier aNotifier, Report aReport) {
208 ProgramActionExecutor executor = new ProgramActionExecutor(aReport);
209 for (ProgramFilter filter : aProgramCondition) {
210 List<Program> programs = filter.apply(aGuide);
211 ProgramAction action = filter.getAction();
212 for (Program program : programs) {
213 action.execute(program, executor);
221 * Creates the crawler.
223 * @param aCrawlerConfig
224 * Crawler configuration file.
226 * Logging output stream for the crawler.
228 * HTTP Client to use.
230 * @throws FileNotFoundException
231 * In case configuration files cannot be found.
233 private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient,
234 XslTransformer aTransformer) throws FileNotFoundException {
235 ConfigurationParser parser = new ConfigurationParser(aTransformer);
236 InputStream crawlerConfigFile = new FileInputStream(new File(
238 Configuration config = parser.parse(crawlerConfigFile);
239 Crawler crawler = new CrawlerImpl(aClient, config);
244 * Gets the start page of the electronic programme guide. This involves
245 * login and navigation to a suitable start page after logging in.
248 * URL of the electronic programme guide.
253 * @return Starting page.
255 private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport)
256 throws PageException {
258 Page page = aCrawler.getPage(aStartUrl, new NameValuePair[0]);
259 page = page.getAction("login").execute();
260 Action favorites = page.getAction("channels-favorites");
261 if (favorites == null) {
262 String msg = "Channels favorites action not found on start page";
263 throw new PageException(msg);
265 return favorites.execute();
266 } catch (PageException e) {
267 String msg = "Could not complete login to electronic programme guide.";
268 throw new PageException(msg, e);
273 * Creates the TV guide by web crawling.
280 * @throws PageException
281 * In case of problem getting the tv guide.
283 private TVGuide createGuide(Page aPage, Report aReport)
284 throws PageException {
285 LOG.info("Obtaining full TV guide");
286 Action[] actions = aPage.getActions();
287 if (actions.length == 0) {
288 LOG.error("No channels found");
289 throw new PageException("No channels found");
291 List<Channel> channels = new ArrayList<Channel>();
292 for (Action action : actions) {
294 LOG.info("Getting channel info for '" + action.getName() + "'");
295 Action tomorrow = action.execute().getAction("tomorrow");
296 if (tomorrow == null) {
297 throw new PageException("Channel summary page for '"
299 + "' does not contain required information");
301 Channel channel = createChannel(action.getName(), tomorrow
302 .execute(), aReport);
303 channels.add(channel);
304 if (SystemProperties.isDebugMode()) {
305 break; // Only one channel is crawled.
307 } catch (PageException e) {
308 aReport.addMessage("Could not create channel information for '"
309 + action.getName() + "'");
310 LOG.error("Could not create channel information for '"
311 + action.getName() + "'", e);
314 return new TVGuide(channels);
318 * Create channel information for a specific channel.
323 * Starting page for the channel.
326 private Channel createChannel(String aChannel, Page aPage, Report aReport) {
327 LOG.info("Obtaining program for " + aChannel);
328 Action[] programActions = aPage.getActions();
329 List<Program> programs = new ArrayList<Program>();
330 for (Action action : programActions) {
331 String time = action.getContent().element("time").getText().trim();
332 Matcher matcher = _pattern.matcher(time);
333 if (matcher.matches()) {
334 Time begin = new Time(Integer.parseInt(matcher.group(1)),
335 Integer.parseInt(matcher.group(2)));
336 Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
337 .parseInt(matcher.group(4)));
338 TimeInterval interval = new TimeInterval(begin, end);
339 String description = "";
340 String keywords = "";
342 if (!SystemProperties.isNoProgramDetailsRequired()) {
343 Element descriptionElem = action.getContent().element(
345 if (descriptionElem == null) {
347 Page programInfo = action.execute();
348 description = programInfo.getContent().element(
349 "description").getText().trim();
350 keywords = programInfo.getContent().element(
351 "keywords").getText().trim();
352 } catch (PageException e) {
353 String msg = "Program details could not be determined for '"
354 + action.getName() + "'";
355 aReport.addMessage(msg, e);
359 description = descriptionElem.getTextTrim();
362 Program program = new Program(aChannel, action.getName(),
363 description, keywords, interval, action);
365 LOG.info("Got program " + program);
366 programs.add(program);
369 return new Channel(aChannel, programs);