(no commit message)
[utils] / crawler / kiss / src / org / wamblee / crawler / kiss / KissCrawler.java
1 /*
2  * Copyright 2005 the original author or authors.
3  * 
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  * 
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  * 
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 package org.wamblee.crawler.kiss;
18
19 import java.io.File;
20 import java.io.FileInputStream;
21 import java.io.FileNotFoundException;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.PrintStream;
26 import java.util.ArrayList;
27 import java.util.Date;
28 import java.util.List;
29 import java.util.Properties;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
32
33 import javax.mail.Message;
34 import javax.mail.MessagingException;
35 import javax.mail.Session;
36 import javax.mail.Transport;
37 import javax.mail.internet.AddressException;
38 import javax.mail.internet.InternetAddress;
39 import javax.mail.internet.MimeMessage;
40
41 import org.apache.commons.httpclient.HttpClient;
42 import org.apache.commons.logging.Log;
43 import org.apache.commons.logging.LogFactory;
44 import org.dom4j.Element;
45 import org.wamblee.conditions.Condition;
46 import org.wamblee.crawler.Action;
47 import org.wamblee.crawler.Configuration;
48 import org.wamblee.crawler.Crawler;
49 import org.wamblee.crawler.Page;
50 import org.wamblee.crawler.PageException;
51 import org.wamblee.crawler.impl.ConfigurationParser;
52 import org.wamblee.crawler.impl.CrawlerImpl;
53
54 /**
55  * 
56  */
57 public class KissCrawler {
58
59     private static final Log LOG = LogFactory.getLog(KissCrawler.class);
60
61     private static final String LOG_FILE = "kiss.log";
62
63     private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
64
65     private static final String CRAWLER_CONFIG = "config.xml";
66
67     private static final String PROGRAM_CONFIG = "programs.xml";
68
69     private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
70
71     private Pattern _pattern;
72
73     public KissCrawler(String aStartUrl, String aCrawlerConfig,
74             String aProgramConfig) throws IOException, AddressException,
75             MessagingException {
76
77         _pattern = Pattern.compile(TIME_REGEX);
78
79         FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
80         PrintStream os = new PrintStream(fos);
81
82         try {
83             HttpClient client = new HttpClient();
84             // client.getHostConfiguration().setProxy("localhost", 3128);
85
86             Crawler crawler = createCrawler(aCrawlerConfig, os, client);
87
88             Page page = getStartPage(aStartUrl, crawler);
89             TVGuide guide = createGuide(page);
90             PrintVisitor printer = new PrintVisitor(System.out);
91             guide.accept(printer);
92
93             InputStream programConfigFile = new FileInputStream(new File(
94                     aProgramConfig));
95             Condition<Program> programCondition = new ProgramConfigurationParser()
96                     .parse(programConfigFile);
97             recordInterestingShows(programCondition, guide);
98         } finally {
99             os.flush();
100             os.close();
101             System.out.println("Output written on '" + LOG_FILE + "'");
102         }
103     }
104
105     /**
106      * @param programCondition
107      * @param guide
108      * @throws AddressException
109      * @throws MessagingException
110      */
111     private void recordInterestingShows(Condition<Program> programCondition,
112             TVGuide guide) throws AddressException, MessagingException {
113         MatchVisitor matcher = new MatchVisitor(programCondition);
114         guide.accept(matcher);
115         List<Program> programs = matcher.getMatches();
116         String recorded = "";
117         String notRecorded = "";
118         String failures = "";
119         for (Program program : programs) {
120             try {
121                 boolean result = program.record();
122                 if (result) {
123                     recorded += "\n" + program;
124                 } else {
125                     notRecorded += "\n" + program;
126                 }
127             } catch (PageException e) {
128                 LOG.info("Attempt to record " + program + " failed.");
129                 failures += "\n" + program.toString() + ": " + e.getMessage();
130             }
131         }
132         String msg = "Summary of KiSS crawler: \n\n\n";
133
134         if (recorded.length() > 0) {
135             msg += "Recorded programs:\n\n" + recorded + "\n\n";
136         }
137         if (notRecorded.length() > 0) {
138             msg += "Not recorded programs:\n\n" + notRecorded + "\n\n";
139         }
140         if (recorded.length() == 0 && notRecorded.length() == 0) {
141             msg += "No suitable programs found";
142         }
143         if (failures.length() > 0) {
144             msg += "Failures:\n\n" + failures;
145         }
146         System.out.println(msg);
147         sendMail(msg);
148     }
149
150     /**
151      * @param aCrawlerConfig
152      * @param os
153      * @param client
154      * @return
155      * @throws FileNotFoundException
156      */
157     private Crawler createCrawler(String aCrawlerConfig, PrintStream os,
158             HttpClient client) throws FileNotFoundException {
159         ConfigurationParser parser = new ConfigurationParser(os);
160         InputStream crawlerConfigFile = new FileInputStream(new File(
161                 aCrawlerConfig));
162         Configuration config = parser.parse(crawlerConfigFile);
163         Crawler crawler = new CrawlerImpl(client, config);
164         return crawler;
165     }
166
167     /**
168      * @param aStartUrl
169      * @param crawler
170      * @return
171      */
172     private Page getStartPage(String aStartUrl, Crawler crawler) {
173         try {
174             Page page = crawler.getPage(aStartUrl);
175             return page.getAction("channels-favorites").execute();
176         } catch (PageException e) {
177             throw new RuntimeException(
178                     "Could not login to electronic program guide", e);
179         }
180     }
181
182     public static void main(String[] args) throws Exception {
183         new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
184     }
185
186     private void showPage(Page aPage) {
187         Action[] links = aPage.getActions();
188         for (Action link : links) {
189             System.out.println("Link found '" + link.getName() + "'");
190         }
191         Element element = aPage.getContent();
192         System.out.println("Retrieved content: " + element.asXML());
193     }
194
195     private TVGuide createGuide(Page page) {
196         LOG.info("Obtaining full TV guide");
197         Action[] actions = page.getActions();
198         List<Channel> channels = new ArrayList<Channel>();
199         for (Action action : actions) {
200             try {
201                 LOG.info("Getting channel info for '" + action.getName() + "'");
202                 Channel channel = createChannel(action.getName(), action
203                         .execute().getAction("right-now").execute());
204                 channels.add(channel);
205             } catch (PageException e) {
206                 LOG.error("Could not create channel information for '"
207                         + action.getName() + "'", e);
208             }
209         }
210         return new TVGuide(channels);
211     }
212
213     private Channel createChannel(String aChannel, Page aPage) {
214         LOG.info("Obtaining program for " + aChannel);
215         Action[] programActions = aPage.getActions();
216         List<Program> programs = new ArrayList<Program>();
217         for (Action action : programActions) {
218             String time = action.getContent().element("time").getText().trim();
219             Matcher matcher = _pattern.matcher(time);
220             if (matcher.matches()) {
221                 Time begin = new Time(Integer.parseInt(matcher.group(1)),
222                         Integer.parseInt(matcher.group(2)));
223                 Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
224                         .parseInt(matcher.group(4)));
225                 TimeInterval interval = new TimeInterval(begin, end);
226                 // Page programInfo = action.execute();
227                 // String description =
228                 // programInfo.getContent().element("description").getText().trim();
229                 // String keywords =
230                 // programInfo.getContent().element("keywords").getText().trim();
231                 String description = "";
232                 String keywords = "";
233                 Program program = new Program(aChannel, action.getName(),
234                         description, keywords, interval, action);
235
236                 LOG.debug("Got program " + program);
237                 programs.add(program);
238             }
239         }
240         return new Channel(aChannel, programs);
241     }
242
243     private void sendMail(String aText) throws AddressException,
244             MessagingException {
245         Properties props = new Properties();
246         props.put("mail.transport.protocol", "smtp");
247         props.put("mail.smtp.host", "falcon");
248         props.put("mail.smtp.port", "25");
249
250         Session mailSession = Session.getInstance(props);
251         Message message = new MimeMessage(mailSession);
252
253         message.setFrom(new InternetAddress("erik@brakkee.org"));
254         message.setRecipient(Message.RecipientType.TO, new InternetAddress(
255                 "erik@brakkee.org"));
256         message.setSentDate(new Date());
257         message.setSubject("KiSS crawler update");
258         message.setText(aText);
259         Transport.send(message);
260     }
261 }