/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org.wamblee.crawler.kiss.main;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.mail.MessagingException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
import org.wamblee.crawler.PageException;
import org.wamblee.crawler.impl.ConfigurationParser;
import org.wamblee.crawler.impl.CrawlerImpl;
import org.wamblee.crawler.kiss.guide.Channel;
import org.wamblee.crawler.kiss.guide.PrintVisitor;
import org.wamblee.crawler.kiss.guide.Program;
import org.wamblee.crawler.kiss.guide.TVGuide;
import org.wamblee.crawler.kiss.guide.Time;
import org.wamblee.crawler.kiss.guide.TimeInterval;
import org.wamblee.crawler.kiss.notification.NotificationException;
import org.wamblee.crawler.kiss.notification.Notifier;
import org.wamblee.xml.ClasspathUriResolver;
import org.wamblee.xml.XslTransformer;
/**
 * The KiSS crawler for automatic recording of interesting TV shows.
 */
56 public class KissCrawler {
58 private static final Log LOG = LogFactory.getLog(KissCrawler.class);
61 * Start URL of the electronic programme guide.
63 private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
66 * Default socket timeout to use.
68 private static final int SOCKET_TIMEOUT = 20000;
71 * Regular expression for matching time interval strings in the retrieved
74 private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
77 * Compiled pattern for the time regular expression.
79 private Pattern _pattern;
82 * Runs the KiSS crawler.
85 * Arguments, currently all ignored because they are hardcoded.
87 * In case of problems.
89 public static void main(String[] aArgs) throws Exception {
90 String crawlerConfig = new File(aArgs[0]).getCanonicalPath();
91 String programConfig = new File(aArgs[1]).getCanonicalPath();
92 new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig);
/**
 * Constructs the crawler using the default start URL and the default
 * socket timeout. This retrieves the TV guide by crawling the KiSS EPG
 * guide, filters the guide for interesting programs, tries to record
 * them, and sends a summary notification to the user.
 *
 * @param aCrawlerConfig
 *            Configuration file for the crawler.
 * @param aProgramConfig
 *            Configuration file describing interesting shows.
 * @throws IOException
 *             In case of problems reading files.
 * @throws NotificationException
 *             In case of problems sending the summary notification.
 */
public KissCrawler(String aCrawlerConfig,
        String aProgramConfig) throws IOException, NotificationException {
    this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig);
}
116 * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
117 * EPG guide, filters the guide for interesting programs, tries to record
118 * them, and sends a summary mail to the user.
121 * Start URL of the electronic programme guide.
122 * @param aSocketTimeout Socket timeout to use.
123 * @param aCrawlerConfig
124 * Configuration file for the crawler.
125 * @param aProgramConfig
126 * Configuration file describing interesting shows.
127 * @throws IOException
128 * In case of problems reading files.
129 * @throws MessagingException
130 * In case of problems sending a mail notification.
132 public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig,
133 String aProgramConfig) throws IOException, NotificationException {
135 _pattern = Pattern.compile(TIME_REGEX);
138 HttpClient client = new HttpClient();
139 // client.getHostConfiguration().setProxy("127.0.0.1", 3128);
140 client.getParams().setParameter("http.socket.timeout", SOCKET_TIMEOUT);
142 XslTransformer transformer = new XslTransformer(
143 new ClasspathUriResolver());
145 Crawler crawler = createCrawler(aCrawlerConfig, client, transformer);
146 InputStream programConfigFile = new FileInputStream(new File(
148 ProgramConfigurationParser parser = new ProgramConfigurationParser(
150 parser.parse(programConfigFile);
151 List<ProgramFilter> programFilters = parser.getFilters();
153 Report report = new Report();
156 Page page = getStartPage(aStartUrl, crawler, report);
157 TVGuide guide = createGuide(page, report);
158 PrintVisitor printer = new PrintVisitor(System.out);
159 guide.accept(printer);
160 processResults(programFilters, guide, parser.getNotifier(),
162 } catch (PageException e) {
163 report.addMessage("Problem getting TV guide", e);
164 LOG.info("Problem getting TV guide", e);
166 parser.getNotifier().send(report.asXml());
168 System.out.println("Crawler finished");
173 * Records interesting shows.
175 * @param aProgramCondition
176 * Condition determining which shows are interesting.
179 * @throws MessagingException
180 * In case of problems sending a summary mail.
182 private void processResults(List<ProgramFilter> aProgramCondition,
183 TVGuide aGuide, Notifier aNotifier, Report aReport) {
184 ProgramActionExecutor executor = new ProgramActionExecutor(aReport);
185 for (ProgramFilter filter : aProgramCondition) {
186 List<Program> programs = filter.apply(aGuide);
187 ProgramAction action = filter.getAction();
188 for (Program program : programs) {
189 action.execute(program, executor);
197 * Creates the crawler.
199 * @param aCrawlerConfig
200 * Crawler configuration file.
202 * Logging output stream for the crawler.
204 * HTTP Client to use.
206 * @throws FileNotFoundException
207 * In case configuration files cannot be found.
209 private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient,
210 XslTransformer aTransformer) throws FileNotFoundException {
211 ConfigurationParser parser = new ConfigurationParser(aTransformer);
212 InputStream crawlerConfigFile = new FileInputStream(new File(
214 Configuration config = parser.parse(crawlerConfigFile);
215 Crawler crawler = new CrawlerImpl(aClient, config);
220 * Gets the start page of the electronic programme guide. This involves
221 * login and navigation to a suitable start page after logging in.
224 * URL of the electronic programme guide.
229 * @return Starting page.
231 private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport)
232 throws PageException {
234 Page page = aCrawler.getPage(aStartUrl);
235 Action favorites = page.getAction("channels-favorites");
236 if (favorites == null) {
237 String msg = "Channels favorites action not found on start page";
238 throw new PageException(msg);
240 return favorites.execute();
241 } catch (PageException e) {
242 String msg = "Could not complete login to electronic programme guide.";
243 throw new PageException(msg, e);
248 * Creates the TV guide by web crawling.
256 private TVGuide createGuide(Page aPage, Report aReport) {
257 LOG.info("Obtaining full TV guide");
258 Action[] actions = aPage.getActions();
259 List<Channel> channels = new ArrayList<Channel>();
260 for (Action action : actions) {
262 LOG.info("Getting channel info for '" + action.getName() + "'");
263 Action rightNow = action.execute().getAction("right-now");
264 if (rightNow == null) {
265 throw new PageException("Channel summary page for '"
267 + "' does not contain required information");
269 Channel channel = createChannel(action.getName(), rightNow
270 .execute(), aReport);
271 channels.add(channel);
272 if (SystemProperties.isDebugMode()) {
273 break; // Only one channel is crawled.
275 } catch (PageException e) {
276 aReport.addMessage("Could not create channel information for '"
277 + action.getName() + "'");
278 LOG.error("Could not create channel information for '"
279 + action.getName() + "'", e);
282 return new TVGuide(channels);
286 * Create channel information for a specific channel.
291 * Starting page for the channel.
294 private Channel createChannel(String aChannel, Page aPage, Report aReport) {
295 LOG.info("Obtaining program for " + aChannel);
296 Action[] programActions = aPage.getActions();
297 List<Program> programs = new ArrayList<Program>();
298 for (Action action : programActions) {
299 String time = action.getContent().element("time").getText().trim();
300 Matcher matcher = _pattern.matcher(time);
301 if (matcher.matches()) {
302 Time begin = new Time(Integer.parseInt(matcher.group(1)),
303 Integer.parseInt(matcher.group(2)));
304 Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
305 .parseInt(matcher.group(4)));
306 TimeInterval interval = new TimeInterval(begin, end);
307 String description = "";
308 String keywords = "";
309 if (!SystemProperties.isNoProgramDetailsRequired()) {
311 Page programInfo = action.execute();
312 description = programInfo.getContent().element(
313 "description").getText().trim();
314 keywords = programInfo.getContent().element("keywords")
316 } catch (PageException e) {
317 String msg = "Program details could not be determined for '"
318 + action.getName() + "'";
319 aReport.addMessage(msg, e);
323 Program program = new Program(aChannel, action.getName(),
324 description, keywords, interval, action);
326 LOG.info("Got program " + program);
327 programs.add(program);
330 return new Channel(aChannel, programs);