/* * Copyright 2005 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.wamblee.crawler.kiss; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.PrintStream; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Element; import org.wamblee.conditions.Condition; import org.wamblee.conditions.OrCondition; import org.wamblee.crawler.Action; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; import org.wamblee.crawler.impl.ConfigurationParser; import org.wamblee.crawler.impl.CrawlerImpl; /** * */ public class KissCrawler { private static final Log LOG = LogFactory.getLog(KissCrawler.class); private static final String LOG_FILE = "kiss.log"; private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; private static final String CRAWLER_CONFIG = "config.xml"; private static final String PROGRAM_CONFIG = "programs.xml"; private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; private Pattern _pattern; public KissCrawler(String aStartUrl, String aCrawlerConfig, String aProgramConfig) throws Exception { _pattern = Pattern.compile(TIME_REGEX); FileOutputStream fos = new FileOutputStream(new File(LOG_FILE)); PrintStream os = new PrintStream(fos); try { ConfigurationParser parser = new ConfigurationParser(os); InputStream crawlerConfigFile = new FileInputStream(new File(aCrawlerConfig)); Configuration config = parser.parse(crawlerConfigFile); InputStream programConfigFile = new FileInputStream(new File(aProgramConfig)); Condition programCondition = new ProgramConfigurationParser().parse(programConfigFile); HttpClient client = new HttpClient(); // client.getHostConfiguration().setProxy("localhost", 3128); Crawler crawler = new CrawlerImpl(client, config); Page page = crawler.getPage(aStartUrl); showPage(page); page = page.getAction("channels-favorites").execute(); TVGuide guide = createGuide(page); PrintVisitor printer = new PrintVisitor(System.out); guide.accept(printer); MatchVisitor matcher = new MatchVisitor(programCondition); guide.accept(matcher); List programs = matcher.getMatches(); for (Program program: programs) { System.out.println("Found: " + program + " record: " + program.record() ); } } finally { os.flush(); os.close(); System.out.println("Output written on '" + LOG_FILE + "'"); } } public static void main(String[] args) throws Exception { new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); } private void showPage(Page aPage) { Action[] links = aPage.getActions(); for (Action link : links) { System.out.println("Link found '" + link.getName() + "'"); } Element element = aPage.getContent(); System.out.println("Retrieved content: " + element.asXML()); } private TVGuide createGuide(Page page) { LOG.info("Obtaining full TV guide"); Action[] actions = page.getActions(); List channels = new ArrayList(); for (Action action : actions) { Channel channel = createChannel(action.getName(), action.execute() .getAction("right-now").execute()); channels.add(channel); } return new TVGuide(channels); } private Channel createChannel(String aChannel, Page aPage) { LOG.info("Obtaining program for " + aChannel); Action[] programActions = aPage.getActions(); List programs = new ArrayList(); for (Action action : programActions) { String time = action.getContent().element("time").getText().trim(); Matcher matcher = _pattern.matcher(time); if (matcher.matches()) { Time begin = new Time(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2))); Time end = new Time(Integer.parseInt(matcher.group(3)), Integer.parseInt(matcher.group(4))); TimeInterval interval = new TimeInterval(begin, end); //Page programInfo = action.execute(); //String description = programInfo.getContent().element("description").getText().trim(); //String keywords = programInfo.getContent().element("keywords").getText().trim(); String description = ""; String keywords = ""; Program program = new Program(aChannel, action.getName(), description, keywords, interval, action); LOG.debug("Got program " + program); programs.add(program); } } return new Channel(aChannel, programs); } }