/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org.wamblee.crawler.kiss.main;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.mail.MessagingException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
import org.wamblee.crawler.PageException;
import org.wamblee.crawler.impl.ConfigurationParser;
import org.wamblee.crawler.impl.CrawlerImpl;
import org.wamblee.crawler.kiss.guide.Channel;
import org.wamblee.crawler.kiss.guide.PrintVisitor;
import org.wamblee.crawler.kiss.guide.Program;
import org.wamblee.crawler.kiss.guide.TVGuide;
import org.wamblee.crawler.kiss.guide.Time;
import org.wamblee.crawler.kiss.guide.TimeInterval;
import org.wamblee.crawler.kiss.notification.NotificationException;
import org.wamblee.crawler.kiss.notification.Notifier;
import org.wamblee.general.BeanFactory;
import org.wamblee.xml.ClasspathUriResolver;
import org.wamblee.xml.XslTransformer;
/**
 * The KiSS crawler for automatic recording of interesting TV shows.
 */
58 public class KissCrawler {
60 private static final Log LOG = LogFactory.getLog(KissCrawler.class);
63 * Start URL of the electronic programme guide.
65 private static final String START_URL = "http://epg.kml.kiss-technology.com/login.php";
68 * Default socket timeout to use.
70 private static final int SOCKET_TIMEOUT = 10000;
73 * Regular expression for matching time interval strings in the retrieved
76 private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
79 * Compiled pattern for the time regular expression.
81 private Pattern _pattern;
84 * Runs the KiSS crawler.
87 * Arguments, currently all ignored because they are hardcoded.
89 * In case of problems.
91 public static void main(String[] aArgs) throws Exception {
92 String crawlerConfig = new File(aArgs[0]).getCanonicalPath();
93 String programConfig = new File(aArgs[1]).getCanonicalPath();
95 BeanFactory factory = new StandaloneCrawlerBeanFactory();
96 Notifier notifier = factory.find(Notifier.class);
97 new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig, notifier, new Report());
/**
 * Constructs the crawler with the built-in defaults for the start URL
 * ({@code START_URL}) and the socket timeout ({@code SOCKET_TIMEOUT}).
 * All work is delegated to the six-argument constructor, which crawls the
 * KiSS EPG guide, filters it for interesting programs, runs the configured
 * actions on them and sends the resulting report through the notifier.
 *
 * @param aCrawlerConfig
 *            Configuration file for the crawler.
 * @param aProgramConfig
 *            Configuration file describing interesting shows.
 * @param aNotifier
 *            Object used to send notifications of the results.
 * @param aReport
 *            Report to use.
 * @throws IOException
 *             In case of problems reading files.
 * @throws NotificationException
 *             In case notification fails.
 * @throws PageException
 *             In case of problems retrieving the TV guide.
 */
public KissCrawler(String aCrawlerConfig, String aProgramConfig,
        Notifier aNotifier, Report aReport)
        throws IOException, NotificationException, PageException {
    this(START_URL, SOCKET_TIMEOUT,
            aCrawlerConfig, aProgramConfig,
            aNotifier, aReport);
}
123 * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
124 * EPG guide, filters the guide for interesting programs, tries to record
125 * them, and sends a summary mail to the user.
128 * Start URL of the electronic programme guide.
129 * @param aSocketTimeout Socket timeout to use.
130 * @param aCrawlerConfig
131 * Configuration file for the crawler.
132 * @param aProgramConfig
133 * Configuration file describing interesting shows.
134 * @param aNotifier Object used to send notifications of the results.
135 * @param aReport Report to use.
136 * @throws IOException
137 * In case of problems reading files.
138 * @throws NotificationException In case notification fails.
139 * @throws PageException In case of problems retrieving the TV guide.
141 public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig,
142 String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException {
144 _pattern = Pattern.compile(TIME_REGEX);
147 HttpClient client = new HttpClient();
148 // client.getHostConfiguration().setProxy("127.0.0.1", 3128);
149 client.getParams().setParameter("http.socket.timeout", SOCKET_TIMEOUT);
151 XslTransformer transformer = new XslTransformer(
152 new ClasspathUriResolver());
154 Crawler crawler = createCrawler(aCrawlerConfig, client, transformer);
155 InputStream programConfigFile = new FileInputStream(new File(
157 ProgramConfigurationParser parser = new ProgramConfigurationParser();
158 parser.parse(programConfigFile);
159 List<ProgramFilter> programFilters = parser.getFilters();
162 Page page = getStartPage(aStartUrl, crawler, aReport);
163 TVGuide guide = createGuide(page, aReport);
164 PrintVisitor printer = new PrintVisitor(System.out);
165 guide.accept(printer);
166 processResults(programFilters, guide, aNotifier,
168 } catch (PageException e) {
169 aReport.addMessage("Problem getting TV guide", e);
170 LOG.info("Problem getting TV guide", e);
173 aNotifier.send(aReport.asXml());
175 System.out.println("Crawler finished");
180 * Records interesting shows.
182 * @param aProgramCondition
183 * Condition determining which shows are interesting.
186 * @throws MessagingException
187 * In case of problems sending a summary mail.
189 private void processResults(List<ProgramFilter> aProgramCondition,
190 TVGuide aGuide, Notifier aNotifier, Report aReport) {
191 ProgramActionExecutor executor = new ProgramActionExecutor(aReport);
192 for (ProgramFilter filter : aProgramCondition) {
193 List<Program> programs = filter.apply(aGuide);
194 ProgramAction action = filter.getAction();
195 for (Program program : programs) {
196 action.execute(program, executor);
204 * Creates the crawler.
206 * @param aCrawlerConfig
207 * Crawler configuration file.
209 * Logging output stream for the crawler.
211 * HTTP Client to use.
213 * @throws FileNotFoundException
214 * In case configuration files cannot be found.
216 private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient,
217 XslTransformer aTransformer) throws FileNotFoundException {
218 ConfigurationParser parser = new ConfigurationParser(aTransformer);
219 InputStream crawlerConfigFile = new FileInputStream(new File(
221 Configuration config = parser.parse(crawlerConfigFile);
222 Crawler crawler = new CrawlerImpl(aClient, config);
227 * Gets the start page of the electronic programme guide. This involves
228 * login and navigation to a suitable start page after logging in.
231 * URL of the electronic programme guide.
236 * @return Starting page.
238 private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport)
239 throws PageException {
241 Page page = aCrawler.getPage(aStartUrl, new NameValuePair[0]);
242 page = page.getAction("login").execute();
243 Action favorites = page.getAction("channels-favorites");
244 if (favorites == null) {
245 String msg = "Channels favorites action not found on start page";
246 throw new PageException(msg);
248 return favorites.execute();
249 } catch (PageException e) {
250 String msg = "Could not complete login to electronic programme guide.";
251 throw new PageException(msg, e);
256 * Creates the TV guide by web crawling.
263 * @throws PageException In case of problem getting the tv guide.
265 private TVGuide createGuide(Page aPage, Report aReport) throws PageException {
266 LOG.info("Obtaining full TV guide");
267 Action[] actions = aPage.getActions();
268 if ( actions.length == 0 ) {
269 LOG.error("No channels found");
270 throw new PageException("No channels found");
272 List<Channel> channels = new ArrayList<Channel>();
273 for (Action action : actions) {
275 LOG.info("Getting channel info for '" + action.getName() + "'");
276 Action rightNow = action.execute().getAction("right-now");
277 if (rightNow == null) {
278 throw new PageException("Channel summary page for '"
280 + "' does not contain required information");
282 Channel channel = createChannel(action.getName(), rightNow
283 .execute(), aReport);
284 channels.add(channel);
285 if (SystemProperties.isDebugMode()) {
286 break; // Only one channel is crawled.
288 } catch (PageException e) {
289 aReport.addMessage("Could not create channel information for '"
290 + action.getName() + "'");
291 LOG.error("Could not create channel information for '"
292 + action.getName() + "'", e);
295 return new TVGuide(channels);
299 * Create channel information for a specific channel.
304 * Starting page for the channel.
307 private Channel createChannel(String aChannel, Page aPage, Report aReport) {
308 LOG.info("Obtaining program for " + aChannel);
309 Action[] programActions = aPage.getActions();
310 List<Program> programs = new ArrayList<Program>();
311 for (Action action : programActions) {
312 String time = action.getContent().element("time").getText().trim();
313 Matcher matcher = _pattern.matcher(time);
314 if (matcher.matches()) {
315 Time begin = new Time(Integer.parseInt(matcher.group(1)),
316 Integer.parseInt(matcher.group(2)));
317 Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
318 .parseInt(matcher.group(4)));
319 TimeInterval interval = new TimeInterval(begin, end);
320 String description = "";
321 String keywords = "";
322 if (!SystemProperties.isNoProgramDetailsRequired()) {
324 Page programInfo = action.execute();
325 description = programInfo.getContent().element(
326 "description").getText().trim();
327 keywords = programInfo.getContent().element("keywords")
329 } catch (PageException e) {
330 String msg = "Program details could not be determined for '"
331 + action.getName() + "'";
332 aReport.addMessage(msg, e);
336 Program program = new Program(aChannel, action.getName(),
337 description, keywords, interval, action);
339 LOG.info("Got program " + program);
340 programs.add(program);
343 return new Channel(aChannel, programs);