2 * Copyright 2005 the original author or authors.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 package org.wamblee.crawler.kiss;
20 import java.io.FileInputStream;
21 import java.io.FileNotFoundException;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.PrintStream;
26 import java.util.ArrayList;
27 import java.util.Date;
28 import java.util.List;
29 import java.util.Properties;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
33 import javax.mail.Message;
34 import javax.mail.MessagingException;
35 import javax.mail.Session;
36 import javax.mail.Transport;
37 import javax.mail.internet.InternetAddress;
38 import javax.mail.internet.MimeMessage;
40 import org.apache.commons.httpclient.HttpClient;
41 import org.apache.commons.logging.Log;
42 import org.apache.commons.logging.LogFactory;
43 import org.wamblee.crawler.Action;
44 import org.wamblee.crawler.Configuration;
45 import org.wamblee.crawler.Crawler;
46 import org.wamblee.crawler.Page;
47 import org.wamblee.crawler.PageException;
48 import org.wamblee.crawler.impl.ConfigurationParser;
49 import org.wamblee.crawler.impl.CrawlerImpl;
52 * The KiSS crawler for automatic recording of interesting TV shows.
55 public class KissCrawler {
57 private static final Log LOG = LogFactory.getLog(KissCrawler.class);
60 * Log file name for the crawler.
62 private static final String LOG_FILE = "kiss.log";
65 * Start URL of the electronic programme guide.
67 private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
70 * Crawler configuration file.
72 private static final String CRAWLER_CONFIG = "config.xml";
75 * Configuration file describing interesting programs.
77 private static final String PROGRAM_CONFIG = "programs.xml";
80 * Regular expression for matching time interval strings in the retrieved
83 private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
86 * Compiled pattern for the time regular expression.
88 private Pattern _pattern;
91 * Runs the KiSS crawler.
94 * Arguments, currently all ignored because they are hardcoded.
96 * In case of problems.
98 public static void main(String[] aArgs) throws Exception {
99 new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
103 * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
104 * EPG guide, filters the guide for interesting programs, tries to record
105 * them, and sends a summary mail to the user.
108 * Start URL of the electronic programme guide.
109 * @param aCrawlerConfig
110 * Configuration file for the crawler.
111 * @param aProgramConfig
112 * Configuration file describing interesting shows.
113 * @throws IOException
114 * In case of problems reading files.
115 * @throws MessagingException
116 * In case of problems sending a mail notification.
118 public KissCrawler(String aStartUrl, String aCrawlerConfig,
119 String aProgramConfig) throws IOException, MessagingException {
121 _pattern = Pattern.compile(TIME_REGEX);
123 FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
124 PrintStream os = new PrintStream(fos);
127 HttpClient client = new HttpClient();
128 // client.getHostConfiguration().setProxy("127.0.0.1", 3128);
130 Crawler crawler = createCrawler(aCrawlerConfig, os, client);
131 InputStream programConfigFile = new FileInputStream(new File(
133 List<ProgramFilter> programFilters = new ProgramConfigurationParser()
134 .parse(programConfigFile);
136 Page page = getStartPage(aStartUrl, crawler);
137 TVGuide guide = createGuide(page);
138 PrintVisitor printer = new PrintVisitor(System.out);
139 guide.accept(printer);
140 processResults(programFilters, guide);
144 System.out.println("Output written on '" + LOG_FILE + "'");
149 * Records interesting shows.
151 * @param aProgramCondition
152 * Condition determining which shows are interesting.
155 * @throws MessagingException
156 * In case of problems sending a summary mail.
158 private void processResults(List<ProgramFilter> aProgramCondition,
159 TVGuide aGuide) throws MessagingException {
160 ProgramActionExecutor executor = new ProgramActionExecutor();
161 for (ProgramFilter filter : aProgramCondition) {
162 List<Program> programs = filter.apply(aGuide);
163 ProgramAction action = filter.getAction();
164 for (Program program: programs) {
165 action.execute(program, executor);
169 String msg = executor.getReport();
170 System.out.println(msg);
175 * Creates the crawler.
177 * @param aCrawlerConfig
178 * Crawler configuration file.
180 * Logging output stream for the crawler.
182 * HTTP Client to use.
184 * @throws FileNotFoundException
185 * In case configuration files cannot be found.
187 private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs,
188 HttpClient aClient) throws FileNotFoundException {
189 ConfigurationParser parser = new ConfigurationParser(aOs);
190 InputStream crawlerConfigFile = new FileInputStream(new File(
192 Configuration config = parser.parse(crawlerConfigFile);
193 Crawler crawler = new CrawlerImpl(aClient, config);
198 * Gets the start page of the electronic programme guide. This involves
199 * login and navigation to a suitable start page after logging in.
202 * URL of the electronic programme guide.
205 * @return Starting page.
207 private Page getStartPage(String aStartUrl, Crawler aCrawler) {
209 Page page = aCrawler.getPage(aStartUrl);
210 return page.getAction("channels-favorites").execute();
211 } catch (PageException e) {
212 throw new RuntimeException(
213 "Could not login to electronic program guide", e);
218 * Creates the TV guide by web crawling.
224 private TVGuide createGuide(Page aPage) {
225 LOG.info("Obtaining full TV guide");
226 Action[] actions = aPage.getActions();
227 List<Channel> channels = new ArrayList<Channel>();
228 for (Action action : actions) {
230 LOG.info("Getting channel info for '" + action.getName() + "'");
231 Channel channel = createChannel(action.getName(), action
232 .execute().getAction("right-now").execute());
233 channels.add(channel);
234 if (SystemProperties.isDebugMode()) {
235 break; // Only one channel is crawled.
237 } catch (PageException e) {
238 LOG.error("Could not create channel information for '"
239 + action.getName() + "'", e);
242 return new TVGuide(channels);
246 * Create channel information for a specific channel.
251 * Starting page for the channel.
254 private Channel createChannel(String aChannel, Page aPage) {
255 LOG.info("Obtaining program for " + aChannel);
256 Action[] programActions = aPage.getActions();
257 List<Program> programs = new ArrayList<Program>();
258 for (Action action : programActions) {
259 String time = action.getContent().element("time").getText().trim();
260 Matcher matcher = _pattern.matcher(time);
261 if (matcher.matches()) {
262 Time begin = new Time(Integer.parseInt(matcher.group(1)),
263 Integer.parseInt(matcher.group(2)));
264 Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
265 .parseInt(matcher.group(4)));
266 TimeInterval interval = new TimeInterval(begin, end);
267 String description = "";
268 String keywords = "";
269 if (!SystemProperties.isNoProgramDetailsRequired()) {
271 Page programInfo = action.execute();
272 description = programInfo.getContent().element(
273 "description").getText().trim();
274 keywords = programInfo.getContent().element("keywords")
276 } catch (PageException e) {
278 .warn("Program details could not be determined for '"
279 + action.getName() + "'", e);
282 Program program = new Program(aChannel, action.getName(),
283 description, keywords, interval, action);
285 LOG.info("Got program " + program);
286 programs.add(program);
289 return new Channel(aChannel, programs);
293 * Sends a summary mail to the user.
297 * @throws MessagingException
298 * In case of problems sending mail.
300 private void sendMail(String aText) throws MessagingException {
301 Properties props = new Properties();
302 props.put("mail.transport.protocol", "smtp");
303 props.put("mail.smtp.host", "falcon");
304 props.put("mail.smtp.port", "25");
306 Session mailSession = Session.getInstance(props);
307 Message message = new MimeMessage(mailSession);
309 message.setFrom(new InternetAddress("erik@brakkee.org"));
310 message.setRecipient(Message.RecipientType.TO, new InternetAddress(
311 "erik@brakkee.org"));
312 message.setSentDate(new Date());
313 message.setSubject("KiSS crawler update");
314 message.setText(aText);
315 Transport.send(message);