2 * Copyright 2005 the original author or authors.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 package org.wamblee.crawler.kiss;
20 import java.io.FileInputStream;
21 import java.io.FileOutputStream;
22 import java.io.InputStream;
23 import java.io.PrintStream;
24 import java.util.ArrayList;
25 import java.util.List;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
29 import org.apache.commons.httpclient.HttpClient;
30 import org.apache.commons.logging.Log;
31 import org.apache.commons.logging.LogFactory;
32 import org.dom4j.Element;
33 import org.wamblee.conditions.Condition;
34 import org.wamblee.conditions.OrCondition;
35 import org.wamblee.crawler.Action;
36 import org.wamblee.crawler.Configuration;
37 import org.wamblee.crawler.Crawler;
38 import org.wamblee.crawler.Page;
39 import org.wamblee.crawler.impl.ConfigurationParser;
40 import org.wamblee.crawler.impl.CrawlerImpl;
45 public class KissCrawler {
47 private static final Log LOG = LogFactory.getLog(KissCrawler.class);
49 private static final String LOG_FILE = "kiss.log";
51 private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
53 private static final String CRAWLER_CONFIG = "config.xml";
55 private static final String PROGRAM_CONFIG = "programs.xml";
57 private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
59 private Pattern _pattern;
61 public KissCrawler(String aStartUrl, String aCrawlerConfig, String aProgramConfig) throws Exception {
63 _pattern = Pattern.compile(TIME_REGEX);
65 FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
66 PrintStream os = new PrintStream(fos);
69 ConfigurationParser parser = new ConfigurationParser(os);
70 InputStream crawlerConfigFile = new FileInputStream(new File(aCrawlerConfig));
71 Configuration config = parser.parse(crawlerConfigFile);
73 InputStream programConfigFile = new FileInputStream(new File(aProgramConfig));
74 Condition<Program> programCondition = new ProgramConfigurationParser().parse(programConfigFile);
77 HttpClient client = new HttpClient();
78 // client.getHostConfiguration().setProxy("localhost", 3128);
80 Crawler crawler = new CrawlerImpl(client, config);
82 Page page = crawler.getPage(aStartUrl);
84 page = page.getAction("channels-favorites").execute();
85 TVGuide guide = createGuide(page);
86 PrintVisitor printer = new PrintVisitor(System.out);
87 guide.accept(printer);
89 MatchVisitor matcher = new MatchVisitor(programCondition);
90 guide.accept(matcher);
91 List<Program> programs = matcher.getMatches();
92 for (Program program: programs) {
93 System.out.println("Found: " + program + " record: " + program.record() );
99 System.out.println("Output written on '" + LOG_FILE + "'");
103 public static void main(String[] args) throws Exception {
104 new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
107 private void showPage(Page aPage) {
108 Action[] links = aPage.getActions();
109 for (Action link : links) {
110 System.out.println("Link found '" + link.getName() + "'");
112 Element element = aPage.getContent();
113 System.out.println("Retrieved content: " + element.asXML());
116 private TVGuide createGuide(Page page) {
117 LOG.info("Obtaining full TV guide");
118 Action[] actions = page.getActions();
119 List<Channel> channels = new ArrayList<Channel>();
120 for (Action action : actions) {
121 Channel channel = createChannel(action.getName(), action.execute()
122 .getAction("right-now").execute());
123 channels.add(channel);
125 return new TVGuide(channels);
128 private Channel createChannel(String aChannel, Page aPage) {
129 LOG.info("Obtaining program for " + aChannel);
130 Action[] programActions = aPage.getActions();
131 List<Program> programs = new ArrayList<Program>();
132 for (Action action : programActions) {
133 String time = action.getContent().element("time").getText().trim();
134 Matcher matcher = _pattern.matcher(time);
135 if (matcher.matches()) {
136 Time begin = new Time(Integer.parseInt(matcher.group(1)),
137 Integer.parseInt(matcher.group(2)));
138 Time end = new Time(Integer.parseInt(matcher.group(3)),
139 Integer.parseInt(matcher.group(4)));
140 TimeInterval interval = new TimeInterval(begin, end);
141 //Page programInfo = action.execute();
142 //String description = programInfo.getContent().element("description").getText().trim();
143 //String keywords = programInfo.getContent().element("keywords").getText().trim();
144 String description = "";
145 String keywords = "";
146 Program program = new Program(aChannel, action.getName(), description, keywords, interval, action);
148 LOG.debug("Got program " + program);
149 programs.add(program);
152 return new Channel(aChannel, programs);