/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org.wamblee.crawler.kiss;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.Transport;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
import org.wamblee.conditions.Condition;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
import org.wamblee.crawler.PageException;
import org.wamblee.crawler.impl.ConfigurationParser;
import org.wamblee.crawler.impl.CrawlerImpl;
/**
 * The KiSS crawler for automatic recording of interesting TV shows.
 */
57 public class KissCrawler {
59 private static final Log LOG = LogFactory.getLog(KissCrawler.class);
62 * Log file name for the crawler.
64 private static final String LOG_FILE = "kiss.log";
67 * Start URL of the electronic programme guide.
69 private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
72 * Crawler configuration file.
74 private static final String CRAWLER_CONFIG = "config.xml";
77 * Configuration file describing interesting programs.
79 private static final String PROGRAM_CONFIG = "programs.xml";
82 * Regular expression for matching time interval strings in the
85 private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
88 * Compiled pattern for the time regular expression.
90 private Pattern _pattern;
93 * Runs the KiSS crawler.
94 * @param aArgs Arguments, currently all ignored because they are hardcoded.
95 * @throws Exception In case of problems.
97 public static void main(String[] aArgs) throws Exception {
98 new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
102 * Constructs the crawler. This retrieves the TV guide by crawling the
103 * KiSS EPG guide, filters the guide for interesting programs, tries to
104 * record them, and sends a summary mail to the user.
105 * @param aStartUrl Start URL of the electronic programme guide.
106 * @param aCrawlerConfig Configuration file for the crawler.
107 * @param aProgramConfig Configuration file describing interesting shows.
108 * @throws IOException In case of problems reading files.
109 * @throws MessagingException In case of problems sending a mail notification.
111 public KissCrawler(String aStartUrl, String aCrawlerConfig,
112 String aProgramConfig) throws IOException, MessagingException {
114 _pattern = Pattern.compile(TIME_REGEX);
116 FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
117 PrintStream os = new PrintStream(fos);
120 HttpClient client = new HttpClient();
121 // client.getHostConfiguration().setProxy("localhost", 3128);
123 Crawler crawler = createCrawler(aCrawlerConfig, os, client);
125 Page page = getStartPage(aStartUrl, crawler);
126 TVGuide guide = createGuide(page);
127 PrintVisitor printer = new PrintVisitor(System.out);
128 guide.accept(printer);
130 InputStream programConfigFile = new FileInputStream(new File(
132 Condition<Program> programCondition = new ProgramConfigurationParser()
133 .parse(programConfigFile);
134 recordInterestingShows(programCondition, guide);
138 System.out.println("Output written on '" + LOG_FILE + "'");
143 * Records interesting shows.
144 * @param aProgramCondition Condition determining which shows are interesting.
145 * @param aGuide Television guide.
146 * @throws MessagingException In case of problems sending a summary mail.
148 private void recordInterestingShows(Condition<Program> aProgramCondition,
149 TVGuide aGuide) throws MessagingException {
150 MatchVisitor matcher = new MatchVisitor(aProgramCondition);
151 aGuide.accept(matcher);
152 List<Program> programs = matcher.getMatches();
153 String recorded = "";
154 String notRecorded = "";
155 String failures = "";
156 for (Program program : programs) {
158 boolean result = program.record();
160 recorded += "\n" + program;
162 notRecorded += "\n" + program;
164 } catch (PageException e) {
165 LOG.info("Attempt to record " + program + " failed.");
166 failures += "\n" + program.toString() + ": " + e.getMessage();
169 String msg = "Summary of KiSS crawler: \n\n\n";
171 if (recorded.length() > 0) {
172 msg += "Recorded programs:\n\n" + recorded + "\n\n";
174 if (notRecorded.length() > 0) {
175 msg += "Not recorded programs:\n\n" + notRecorded + "\n\n";
177 if (recorded.length() == 0 && notRecorded.length() == 0) {
178 msg += "No suitable programs found";
180 if (failures.length() > 0) {
181 msg += "Failures:\n\n" + failures;
183 System.out.println(msg);
188 * Creates the crawler.
189 * @param aCrawlerConfig Crawler configuration file.
190 * @param aOs Logging output stream for the crawler.
191 * @param aClient HTTP Client to use.
193 * @throws FileNotFoundException In case configuration files cannot be found.
195 private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs,
196 HttpClient aClient) throws FileNotFoundException {
197 ConfigurationParser parser = new ConfigurationParser(aOs);
198 InputStream crawlerConfigFile = new FileInputStream(new File(
200 Configuration config = parser.parse(crawlerConfigFile);
201 Crawler crawler = new CrawlerImpl(aClient, config);
206 * Gets the start page of the electronic programme guide. This involves login and
207 * navigation to a suitable start page after logging in.
208 * @param aStartUrl URL of the electronic programme guide.
209 * @param aCrawler Crawler to use.
210 * @return Starting page.
212 private Page getStartPage(String aStartUrl, Crawler aCrawler) {
214 Page page = aCrawler.getPage(aStartUrl);
215 return page.getAction("channels-favorites").execute();
216 } catch (PageException e) {
217 throw new RuntimeException(
218 "Could not login to electronic program guide", e);
223 * Creates the TV guide by web crawling.
224 * @param aPage Starting page.
227 private TVGuide createGuide(Page aPage) {
228 LOG.info("Obtaining full TV guide");
229 Action[] actions = aPage.getActions();
230 List<Channel> channels = new ArrayList<Channel>();
231 for (Action action : actions) {
233 LOG.info("Getting channel info for '" + action.getName() + "'");
234 Channel channel = createChannel(action.getName(), action
235 .execute().getAction("right-now").execute());
236 channels.add(channel);
237 } catch (PageException e) {
238 LOG.error("Could not create channel information for '"
239 + action.getName() + "'", e);
242 return new TVGuide(channels);
246 * Create channel information for a specific channel.
247 * @param aChannel Channel name.
248 * @param aPage Starting page for the channel.
251 private Channel createChannel(String aChannel, Page aPage) {
252 LOG.info("Obtaining program for " + aChannel);
253 Action[] programActions = aPage.getActions();
254 List<Program> programs = new ArrayList<Program>();
255 for (Action action : programActions) {
256 String time = action.getContent().element("time").getText().trim();
257 Matcher matcher = _pattern.matcher(time);
258 if (matcher.matches()) {
259 Time begin = new Time(Integer.parseInt(matcher.group(1)),
260 Integer.parseInt(matcher.group(2)));
261 Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
262 .parseInt(matcher.group(4)));
263 TimeInterval interval = new TimeInterval(begin, end);
264 // Page programInfo = action.execute();
265 // String description =
266 // programInfo.getContent().element("description").getText().trim();
268 // programInfo.getContent().element("keywords").getText().trim();
269 String description = "";
270 String keywords = "";
271 Program program = new Program(aChannel, action.getName(),
272 description, keywords, interval, action);
274 LOG.debug("Got program " + program);
275 programs.add(program);
278 return new Channel(aChannel, programs);
282 * Sends a summary mail to the user.
283 * @param aText Text of the mail.
284 * @throws MessagingException In case of problems sending mail.
286 private void sendMail(String aText) throws MessagingException {
287 Properties props = new Properties();
288 props.put("mail.transport.protocol", "smtp");
289 props.put("mail.smtp.host", "falcon");
290 props.put("mail.smtp.port", "25");
292 Session mailSession = Session.getInstance(props);
293 Message message = new MimeMessage(mailSession);
295 message.setFrom(new InternetAddress("erik@brakkee.org"));
296 message.setRecipient(Message.RecipientType.TO, new InternetAddress(
297 "erik@brakkee.org"));
298 message.setSentDate(new Date());
299 message.setSubject("KiSS crawler update");
300 message.setText(aText);
301 Transport.send(message);