2 * Copyright 2005 the original author or authors.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 package org.wamblee.crawler.kiss;
20 import java.io.FileInputStream;
21 import java.io.FileNotFoundException;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.PrintStream;
26 import java.util.ArrayList;
27 import java.util.Date;
28 import java.util.List;
29 import java.util.Properties;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
33 import javax.mail.Message;
34 import javax.mail.MessagingException;
35 import javax.mail.Session;
36 import javax.mail.Transport;
37 import javax.mail.internet.AddressException;
38 import javax.mail.internet.InternetAddress;
39 import javax.mail.internet.MimeMessage;
41 import org.apache.commons.httpclient.HttpClient;
42 import org.apache.commons.logging.Log;
43 import org.apache.commons.logging.LogFactory;
44 import org.dom4j.Element;
45 import org.wamblee.conditions.Condition;
46 import org.wamblee.crawler.Action;
47 import org.wamblee.crawler.Configuration;
48 import org.wamblee.crawler.Crawler;
49 import org.wamblee.crawler.Page;
50 import org.wamblee.crawler.PageException;
51 import org.wamblee.crawler.impl.ConfigurationParser;
52 import org.wamblee.crawler.impl.CrawlerImpl;
57 public class KissCrawler {
59 private static final Log LOG = LogFactory.getLog(KissCrawler.class);
61 private static final String LOG_FILE = "kiss.log";
63 private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
65 private static final String CRAWLER_CONFIG = "config.xml";
67 private static final String PROGRAM_CONFIG = "programs.xml";
69 private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
71 private Pattern _pattern;
73 public KissCrawler(String aStartUrl, String aCrawlerConfig,
74 String aProgramConfig) throws IOException, AddressException,
77 _pattern = Pattern.compile(TIME_REGEX);
79 FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
80 PrintStream os = new PrintStream(fos);
83 HttpClient client = new HttpClient();
84 // client.getHostConfiguration().setProxy("localhost", 3128);
86 Crawler crawler = createCrawler(aCrawlerConfig, os, client);
88 Page page = getStartPage(aStartUrl, crawler);
89 TVGuide guide = createGuide(page);
90 PrintVisitor printer = new PrintVisitor(System.out);
91 guide.accept(printer);
93 InputStream programConfigFile = new FileInputStream(new File(
95 Condition<Program> programCondition = new ProgramConfigurationParser()
96 .parse(programConfigFile);
97 recordInterestingShows(programCondition, guide);
101 System.out.println("Output written on '" + LOG_FILE + "'");
106 * @param programCondition
108 * @throws AddressException
109 * @throws MessagingException
111 private void recordInterestingShows(Condition<Program> programCondition,
112 TVGuide guide) throws AddressException, MessagingException {
113 MatchVisitor matcher = new MatchVisitor(programCondition);
114 guide.accept(matcher);
115 List<Program> programs = matcher.getMatches();
116 String recorded = "";
117 String notRecorded = "";
118 String failures = "";
119 for (Program program : programs) {
121 boolean result = program.record();
123 recorded += "\n" + program;
125 notRecorded += "\n" + program;
127 } catch (PageException e) {
128 LOG.info("Attempt to record " + program + " failed.");
129 failures += "\n" + program.toString() + ": " + e.getMessage();
132 String msg = "Summary of KiSS crawler: \n\n\n";
134 if (recorded.length() > 0) {
135 msg += "Recorded programs:\n\n" + recorded + "\n\n";
137 if (notRecorded.length() > 0) {
138 msg += "Not recorded programs:\n\n" + notRecorded + "\n\n";
140 if (recorded.length() == 0 && notRecorded.length() == 0) {
141 msg += "No suitable programs found";
143 if (failures.length() > 0) {
144 msg += "Failures:\n\n" + failures;
146 System.out.println(msg);
151 * @param aCrawlerConfig
155 * @throws FileNotFoundException
157 private Crawler createCrawler(String aCrawlerConfig, PrintStream os,
158 HttpClient client) throws FileNotFoundException {
159 ConfigurationParser parser = new ConfigurationParser(os);
160 InputStream crawlerConfigFile = new FileInputStream(new File(
162 Configuration config = parser.parse(crawlerConfigFile);
163 Crawler crawler = new CrawlerImpl(client, config);
172 private Page getStartPage(String aStartUrl, Crawler crawler) {
174 Page page = crawler.getPage(aStartUrl);
175 return page.getAction("channels-favorites").execute();
176 } catch (PageException e) {
177 throw new RuntimeException(
178 "Could not login to electronic program guide", e);
182 public static void main(String[] args) throws Exception {
183 new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
186 private void showPage(Page aPage) {
187 Action[] links = aPage.getActions();
188 for (Action link : links) {
189 System.out.println("Link found '" + link.getName() + "'");
191 Element element = aPage.getContent();
192 System.out.println("Retrieved content: " + element.asXML());
195 private TVGuide createGuide(Page page) {
196 LOG.info("Obtaining full TV guide");
197 Action[] actions = page.getActions();
198 List<Channel> channels = new ArrayList<Channel>();
199 for (Action action : actions) {
201 LOG.info("Getting channel info for '" + action.getName() + "'");
202 Channel channel = createChannel(action.getName(), action
203 .execute().getAction("right-now").execute());
204 channels.add(channel);
205 } catch (PageException e) {
206 LOG.error("Could not create channel information for '"
207 + action.getName() + "'", e);
210 return new TVGuide(channels);
213 private Channel createChannel(String aChannel, Page aPage) {
214 LOG.info("Obtaining program for " + aChannel);
215 Action[] programActions = aPage.getActions();
216 List<Program> programs = new ArrayList<Program>();
217 for (Action action : programActions) {
218 String time = action.getContent().element("time").getText().trim();
219 Matcher matcher = _pattern.matcher(time);
220 if (matcher.matches()) {
221 Time begin = new Time(Integer.parseInt(matcher.group(1)),
222 Integer.parseInt(matcher.group(2)));
223 Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
224 .parseInt(matcher.group(4)));
225 TimeInterval interval = new TimeInterval(begin, end);
226 // Page programInfo = action.execute();
227 // String description =
228 // programInfo.getContent().element("description").getText().trim();
230 // programInfo.getContent().element("keywords").getText().trim();
231 String description = "";
232 String keywords = "";
233 Program program = new Program(aChannel, action.getName(),
234 description, keywords, interval, action);
236 LOG.debug("Got program " + program);
237 programs.add(program);
240 return new Channel(aChannel, programs);
243 private void sendMail(String aText) throws AddressException,
245 Properties props = new Properties();
246 props.put("mail.transport.protocol", "smtp");
247 props.put("mail.smtp.host", "falcon");
248 props.put("mail.smtp.port", "25");
250 Session mailSession = Session.getInstance(props);
251 Message message = new MimeMessage(mailSession);
253 message.setFrom(new InternetAddress("erik@brakkee.org"));
254 message.setRecipient(Message.RecipientType.TO, new InternetAddress(
255 "erik@brakkee.org"));
256 message.setSentDate(new Date());
257 message.setSubject("KiSS crawler update");
258 message.setText(aText);
259 Transport.send(message);