2 * Copyright 2005 the original author or authors.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 package org.wamblee.crawler.kiss.main;
19 import java.io.ByteArrayOutputStream;
21 import java.io.FileInputStream;
22 import java.io.FileNotFoundException;
23 import java.io.FileOutputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.PrintStream;
27 import java.util.ArrayList;
28 import java.util.Arrays;
29 import java.util.Date;
30 import java.util.List;
31 import java.util.Properties;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
35 import javax.mail.MessagingException;
36 import javax.mail.Session;
37 import javax.mail.internet.InternetAddress;
38 import javax.xml.transform.TransformerException;
40 import org.apache.commons.httpclient.HttpClient;
41 import org.apache.commons.logging.Log;
42 import org.apache.commons.logging.LogFactory;
43 import org.apache.commons.mail.EmailException;
44 import org.apache.commons.mail.HtmlEmail;
45 import org.apache.xml.serialize.OutputFormat;
46 import org.apache.xml.serialize.XMLSerializer;
47 import org.w3c.dom.Document;
48 import org.wamblee.crawler.Action;
49 import org.wamblee.crawler.Configuration;
50 import org.wamblee.crawler.Crawler;
51 import org.wamblee.crawler.Page;
52 import org.wamblee.crawler.PageException;
53 import org.wamblee.crawler.impl.ConfigurationParser;
54 import org.wamblee.crawler.impl.CrawlerImpl;
55 import org.wamblee.crawler.kiss.guide.Channel;
56 import org.wamblee.crawler.kiss.guide.PrintVisitor;
57 import org.wamblee.crawler.kiss.guide.Program;
58 import org.wamblee.crawler.kiss.guide.TVGuide;
59 import org.wamblee.crawler.kiss.guide.Time;
60 import org.wamblee.crawler.kiss.guide.TimeInterval;
61 import org.wamblee.crawler.kiss.notification.NotificationException;
62 import org.wamblee.crawler.kiss.notification.Notifier;
63 import org.wamblee.io.FileResource;
64 import org.wamblee.xml.XSLT;
67 * The KiSS crawler for automatic recording of interesting TV shows.
70 public class KissCrawler {
72 private static final Log LOG = LogFactory.getLog(KissCrawler.class);
75 * Log file name for the crawler.
77 private static final String LOG_FILE = "kiss.log";
80 * Start URL of the electronic programme guide.
82 private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
85 * Crawler configuration file.
87 private static final String CRAWLER_CONFIG = "config.xml";
90 * Configuration file describing interesting programs.
92 private static final String PROGRAM_CONFIG = "programs.xml";
95 * Regular expression for matching time interval strings in the retrieved
98 private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
101 * Compiled pattern for the time regular expression.
103 private Pattern _pattern;
106 * Runs the KiSS crawler.
109 * Arguments, currently all ignored because they are hardcoded.
111 * In case of problems.
113 public static void main(String[] aArgs) throws Exception {
114 new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
118 * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
119 * EPG guide, filters the guide for interesting programs, tries to record
120 * them, and sends a summary mail to the user.
123 * Start URL of the electronic programme guide.
124 * @param aCrawlerConfig
125 * Configuration file for the crawler.
126 * @param aProgramConfig
127 * Configuration file describing interesting shows.
128 * @throws IOException
129 * In case of problems reading files.
130 * @throws MessagingException
131 * In case of problems sending a mail notification.
// Runs one complete crawl from inside the constructor: opens the crawler log,
// builds the crawler, loads the program filters, fetches the TV guide, prints it
// to standard output and passes the filters, guide and notifier to processResults.
// NOTE(review): this listing is an extraction in which short original lines are
// absent (the embedded numbering skips 140-141, 147, 151, 157-159 and 161-163), so
// the try/finally structure and any flush/close calls cannot be seen here --
// confirm against the original file before changing resource handling.
133 public KissCrawler(String aStartUrl, String aCrawlerConfig,
134 String aProgramConfig) throws IOException, MessagingException {
// Compiled once per instance; createChannel matches every time string against it.
136 _pattern = Pattern.compile(TIME_REGEX);
// Stream on LOG_FILE that is handed to createCrawler as the crawler's log output.
138 FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
139 PrintStream os = new PrintStream(fos);
142 HttpClient client = new HttpClient();
143 //client.getHostConfiguration().setProxy("127.0.0.1", 3128);
145 Crawler crawler = createCrawler(aCrawlerConfig, os, client);
// NOTE(review): no close() of programConfigFile is visible in this listing --
// verify whether the stream is released after parsing.
146 InputStream programConfigFile = new FileInputStream(new File(
148 ProgramConfigurationParser parser = new ProgramConfigurationParser();
149 parser.parse(programConfigFile);
150 List<ProgramFilter> programFilters = parser.getFilters();
// Crawl: log in, build the guide, then dump it to standard output.
152 Page page = getStartPage(aStartUrl, crawler);
153 TVGuide guide = createGuide(page);
154 PrintVisitor printer = new PrintVisitor(System.out);
155 guide.accept(printer);
// The notifier comes from the same program configuration as the filters.
156 processResults(programFilters, guide, parser.getNotifier());
160 System.out.println("Output written on '" + LOG_FILE + "'");
165 * Records interesting shows.
167 * @param aProgramCondition
168 * Filters determining which shows are interesting, each with the action to run on its matches.
171 * @throws MessagingException
172 * In case of problems sending a summary mail.
// Applies every program filter to the guide, runs the filter's action on each
// matching program through one shared executor, and finally sends the executor's
// XML report through the notifier.
// NOTE(review): the embedded numbering skips 182-185. Besides the two loop-closing
// braces and the opening try, one further short line may be hidden there --
// confirm against the original that no statement sits between the loops and the
// notification.
174 private void processResults(List<ProgramFilter> aProgramCondition,
175 TVGuide aGuide, Notifier aNotifier) throws MessagingException {
// One executor collects the outcome of all actions across all filters.
176 ProgramActionExecutor executor = new ProgramActionExecutor();
177 for (ProgramFilter filter : aProgramCondition) {
// Programs of the guide selected by this filter, and what to do with them.
178 List<Program> programs = filter.apply(aGuide);
179 ProgramAction action = filter.getAction();
180 for (Program program : programs) {
181 action.execute(program, executor);
186 aNotifier.send(executor.getXmlReport());
// A failed notification is rethrown unchecked with the original exception as cause.
187 } catch (NotificationException e) {
188 throw new RuntimeException(e);
193 * Creates the crawler.
195 * @param aCrawlerConfig
196 * Crawler configuration file.
198 * Logging output stream for the crawler.
200 * HTTP Client to use.
202 * @throws FileNotFoundException
203 * In case configuration files cannot be found.
205 private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs,
206 HttpClient aClient) throws FileNotFoundException {
207 ConfigurationParser parser = new ConfigurationParser(aOs);
208 InputStream crawlerConfigFile = new FileInputStream(new File(
210 Configuration config = parser.parse(crawlerConfigFile);
211 Crawler crawler = new CrawlerImpl(aClient, config);
216 * Gets the start page of the electronic programme guide. This involves
217 * login and navigation to a suitable start page after logging in.
220 * URL of the electronic programme guide.
223 * @return Starting page.
225 private Page getStartPage(String aStartUrl, Crawler aCrawler) {
227 Page page = aCrawler.getPage(aStartUrl);
228 return page.getAction("channels-favorites").execute();
229 } catch (PageException e) {
230 throw new RuntimeException(
231 "Could not login to electronic program guide", e);
236 * Creates the TV guide by web crawling.
242 private TVGuide createGuide(Page aPage) {
243 LOG.info("Obtaining full TV guide");
244 Action[] actions = aPage.getActions();
245 List<Channel> channels = new ArrayList<Channel>();
246 for (Action action : actions) {
248 LOG.info("Getting channel info for '" + action.getName() + "'");
249 Channel channel = createChannel(action.getName(), action
250 .execute().getAction("right-now").execute());
251 channels.add(channel);
252 if (SystemProperties.isDebugMode()) {
253 break; // Only one channel is crawled.
255 } catch (PageException e) {
256 LOG.error("Could not create channel information for '"
257 + action.getName() + "'", e);
260 return new TVGuide(channels);
264 * Creates channel information for a specific channel.
269 * Starting page for the channel.
// Builds the program list of one channel: every action on the channel page is a
// candidate program whose "time" element is matched against the compiled time
// pattern. An action whose time text does not match is not added to the list.
// NOTE(review): the embedded numbering skips short lines (288, 293, 295, 298-299,
// 302, 305-306), among them the end of the keywords expression and the logging
// call in the catch block -- confirm both against the original file.
272 private Channel createChannel(String aChannel, Page aPage) {
273 LOG.info("Obtaining program for " + aChannel);
274 Action[] programActions = aPage.getActions();
275 List<Program> programs = new ArrayList<Program>();
276 for (Action action : programActions) {
277 String time = action.getContent().element("time").getText().trim();
278 Matcher matcher = _pattern.matcher(time);
279 if (matcher.matches()) {
// Groups 1/2 are the begin hour/minute, groups 3/4 the end hour/minute.
280 Time begin = new Time(Integer.parseInt(matcher.group(1)),
281 Integer.parseInt(matcher.group(2)));
282 Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
283 .parseInt(matcher.group(4)));
284 TimeInterval interval = new TimeInterval(begin, end);
// Details default to empty strings and are only fetched when required.
285 String description = "";
286 String keywords = "";
287 if (!SystemProperties.isNoProgramDetailsRequired()) {
// Executing the program's action yields the page holding description and keywords.
289 Page programInfo = action.execute();
290 description = programInfo.getContent().element(
291 "description").getText().trim();
292 keywords = programInfo.getContent().element("keywords")
// A PageException while fetching details does not discard the program.
294 } catch (PageException e) {
296 "Program details could not be determined for '"
297 + action.getName() + "'", e);
// The action itself is stored in the Program along with the parsed data.
300 Program program = new Program(aChannel, action.getName(),
301 description, keywords, interval, action);
303 LOG.info("Got program " + program);
304 programs.add(program);
307 return new Channel(aChannel, programs);
311 * Sends a summary mail to the user.
315 * @throws MessagingException
316 * In case of problems sending mail.
// Assembles a summary mail with an HTML part and a plain-text part from the
// executor's reports.
// NOTE(review): no call to this method is visible in this listing (processResults
// reports through the Notifier) -- check whether it is still in use.
// NOTE(review): the embedded numbering skips short lines (322-323, 328, 331, 334,
// 346, 353-354); the opening try and the statement directly before the first
// catch are among them -- confirm against the original file.
318 private void sendMail(ProgramActionExecutor aExecutor) throws MessagingException {
// Both reports are echoed to standard output before the mail is built.
319 String textReport = aExecutor.getReport();
320 System.out.println("Text report: \n" + textReport);
321 System.out.println("XML report:\n" + aExecutor.getXmlReport().asXML());
// SMTP transport settings are hardcoded here.
324 Properties props = new Properties();
325 props.put("mail.transport.protocol", "smtp");
326 props.put("mail.smtp.host", "falcon");
327 props.put("mail.smtp.port", "25");
329 Session mailSession = Session.getInstance(props);
// The same hardcoded address is used as sender and as the only recipient.
330 InternetAddress from = new InternetAddress("erik@brakkee.org");
332 HtmlEmail mail = new HtmlEmail();
333 mail.setMailSession(mailSession);
335 mail.setFrom("erik@brakkee.org");
336 mail.setTo(Arrays.asList(new InternetAddress[] { from }));
337 mail.setSentDate(new Date());
338 mail.setSubject("KiSS Crawler Update");
// HTML part: the XML report is transformed with reportToHtml.xsl (a relative
// path, so resolved against the working directory) and serialized to a buffer.
// NOTE(review): getBytes() and ByteArrayOutputStream.toString() both use the
// platform default charset -- consider an explicit charset.
339 String html = aExecutor.getXmlReport().asXML();
340 Document document = new XSLT().transform(html.getBytes(), new FileResource(new File("reportToHtml.xsl")));
341 ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
342 XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat());
343 serializer.serialize(document);
344 mail.setHtmlMsg(xhtml.toString());
// Plain-text part for the same mail.
345 mail.setTextMsg(textReport);
// Mail, XSLT and I/O failures are each rethrown unchecked with the cause attached.
347 } catch (EmailException e) {
348 throw new RuntimeException(e);
349 } catch (TransformerException e) {
350 throw new RuntimeException(e);
351 } catch (IOException e) {
352 throw new RuntimeException(e);