/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org.wamblee.crawler.kiss.main;
20 import java.io.FileInputStream;
21 import java.io.FileNotFoundException;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.PrintStream;
26 import java.util.ArrayList;
27 import java.util.List;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
31 import javax.mail.MessagingException;
33 import org.apache.commons.httpclient.HttpClient;
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.wamblee.crawler.Action;
37 import org.wamblee.crawler.Configuration;
38 import org.wamblee.crawler.Crawler;
39 import org.wamblee.crawler.Page;
40 import org.wamblee.crawler.PageException;
41 import org.wamblee.crawler.impl.ConfigurationParser;
42 import org.wamblee.crawler.impl.CrawlerImpl;
43 import org.wamblee.crawler.kiss.guide.Channel;
44 import org.wamblee.crawler.kiss.guide.PrintVisitor;
45 import org.wamblee.crawler.kiss.guide.Program;
46 import org.wamblee.crawler.kiss.guide.TVGuide;
47 import org.wamblee.crawler.kiss.guide.Time;
48 import org.wamblee.crawler.kiss.guide.TimeInterval;
49 import org.wamblee.crawler.kiss.notification.NotificationException;
50 import org.wamblee.crawler.kiss.notification.Notifier;
/**
 * The KiSS crawler for automatic recording of interesting TV shows.
 */
56 public class KissCrawler {
58 private static final Log LOG = LogFactory.getLog(KissCrawler.class);
61 * Log file name for the crawler.
63 private static final String LOG_FILE = "kiss.log";
66 * Start URL of the electronic programme guide.
68 private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
71 * Crawler configuration file.
73 private static final String CRAWLER_CONFIG = "config.xml";
76 * Configuration file describing interesting programs.
78 private static final String PROGRAM_CONFIG = "programs.xml";
81 * Regular expression for matching time interval strings in the retrieved
84 private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
87 * Compiled pattern for the time regular expression.
89 private Pattern _pattern;
92 * Runs the KiSS crawler.
95 * Arguments, currently all ignored because they are hardcoded.
97 * In case of problems.
99 public static void main(String[] aArgs) throws Exception {
100 new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
104 * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
105 * EPG guide, filters the guide for interesting programs, tries to record
106 * them, and sends a summary mail to the user.
109 * Start URL of the electronic programme guide.
110 * @param aCrawlerConfig
111 * Configuration file for the crawler.
112 * @param aProgramConfig
113 * Configuration file describing interesting shows.
114 * @throws IOException
115 * In case of problems reading files.
116 * @throws MessagingException
117 * In case of problems sending a mail notification.
119 public KissCrawler(String aStartUrl, String aCrawlerConfig,
120 String aProgramConfig) throws IOException, MessagingException {
122 _pattern = Pattern.compile(TIME_REGEX);
124 FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
125 PrintStream os = new PrintStream(fos);
128 HttpClient client = new HttpClient();
129 // client.getHostConfiguration().setProxy("127.0.0.1", 3128);
131 Crawler crawler = createCrawler(aCrawlerConfig, os, client);
132 InputStream programConfigFile = new FileInputStream(new File(
134 ProgramConfigurationParser parser = new ProgramConfigurationParser();
135 parser.parse(programConfigFile);
136 List<ProgramFilter> programFilters = parser.getFilters();
138 Page page = getStartPage(aStartUrl, crawler);
139 TVGuide guide = createGuide(page);
140 PrintVisitor printer = new PrintVisitor(System.out);
141 guide.accept(printer);
142 processResults(programFilters, guide, parser.getNotifier());
146 System.out.println("Output written on '" + LOG_FILE + "'");
151 * Records interesting shows.
153 * @param aProgramCondition
154 * Condition determining which shows are interesting.
157 * @throws MessagingException
158 * In case of problems sending a summary mail.
160 private void processResults(List<ProgramFilter> aProgramCondition,
161 TVGuide aGuide, Notifier aNotifier) throws MessagingException {
162 ProgramActionExecutor executor = new ProgramActionExecutor();
163 for (ProgramFilter filter : aProgramCondition) {
164 List<Program> programs = filter.apply(aGuide);
165 ProgramAction action = filter.getAction();
166 for (Program program : programs) {
167 action.execute(program, executor);
172 aNotifier.send(executor.getReport());
173 } catch (NotificationException e) {
174 throw new RuntimeException(e);
179 * Creates the crawler.
181 * @param aCrawlerConfig
182 * Crawler configuration file.
184 * Logging output stream for the crawler.
186 * HTTP Client to use.
188 * @throws FileNotFoundException
189 * In case configuration files cannot be found.
191 private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs,
192 HttpClient aClient) throws FileNotFoundException {
193 ConfigurationParser parser = new ConfigurationParser(aOs);
194 InputStream crawlerConfigFile = new FileInputStream(new File(
196 Configuration config = parser.parse(crawlerConfigFile);
197 Crawler crawler = new CrawlerImpl(aClient, config);
202 * Gets the start page of the electronic programme guide. This involves
203 * login and navigation to a suitable start page after logging in.
206 * URL of the electronic programme guide.
209 * @return Starting page.
211 private Page getStartPage(String aStartUrl, Crawler aCrawler) {
213 Page page = aCrawler.getPage(aStartUrl);
214 return page.getAction("channels-favorites").execute();
215 } catch (PageException e) {
216 throw new RuntimeException(
217 "Could not login to electronic program guide", e);
222 * Creates the TV guide by web crawling.
228 private TVGuide createGuide(Page aPage) {
229 LOG.info("Obtaining full TV guide");
230 Action[] actions = aPage.getActions();
231 List<Channel> channels = new ArrayList<Channel>();
232 for (Action action : actions) {
234 LOG.info("Getting channel info for '" + action.getName() + "'");
235 Channel channel = createChannel(action.getName(), action
236 .execute().getAction("right-now").execute());
237 channels.add(channel);
238 if (SystemProperties.isDebugMode()) {
239 break; // Only one channel is crawled.
241 } catch (PageException e) {
242 LOG.error("Could not create channel information for '"
243 + action.getName() + "'", e);
246 return new TVGuide(channels);
250 * Create channel information for a specific channel.
255 * Starting page for the channel.
258 private Channel createChannel(String aChannel, Page aPage) {
259 LOG.info("Obtaining program for " + aChannel);
260 Action[] programActions = aPage.getActions();
261 List<Program> programs = new ArrayList<Program>();
262 for (Action action : programActions) {
263 String time = action.getContent().element("time").getText().trim();
264 Matcher matcher = _pattern.matcher(time);
265 if (matcher.matches()) {
266 Time begin = new Time(Integer.parseInt(matcher.group(1)),
267 Integer.parseInt(matcher.group(2)));
268 Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
269 .parseInt(matcher.group(4)));
270 TimeInterval interval = new TimeInterval(begin, end);
271 String description = "";
272 String keywords = "";
273 if (!SystemProperties.isNoProgramDetailsRequired()) {
275 Page programInfo = action.execute();
276 description = programInfo.getContent().element(
277 "description").getText().trim();
278 keywords = programInfo.getContent().element("keywords")
280 } catch (PageException e) {
282 "Program details could not be determined for '"
283 + action.getName() + "'", e);
286 Program program = new Program(aChannel, action.getName(),
287 description, keywords, interval, action);
289 LOG.info("Got program " + program);
290 programs.add(program);
293 return new Channel(aChannel, programs);