X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fkiss%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2Fmain%2FKissCrawler.java;fp=crawler%2Fkiss%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2Fmain%2FKissCrawler.java;h=0000000000000000000000000000000000000000;hb=62f165891f08ae532b5a794af11d7338a93f9a43;hp=3300e1299e71b6c4268d9ba9447b65d6df17fe8f;hpb=07cedd3f0730646ea35a7f668b3e1e872a4605d9;p=utils diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java deleted file mode 100644 index 3300e129..00000000 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright 2005 the original author or authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/

package org.wamblee.crawler.kiss.main;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.mail.MessagingException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
import org.wamblee.crawler.PageException;
import org.wamblee.crawler.impl.ConfigurationParser;
import org.wamblee.crawler.impl.CrawlerImpl;
import org.wamblee.crawler.kiss.guide.Channel;
import org.wamblee.crawler.kiss.guide.PrintVisitor;
import org.wamblee.crawler.kiss.guide.Program;
import org.wamblee.crawler.kiss.guide.TVGuide;
import org.wamblee.crawler.kiss.guide.Time;
import org.wamblee.crawler.kiss.guide.TimeInterval;
import org.wamblee.crawler.kiss.notification.NotificationException;
import org.wamblee.crawler.kiss.notification.Notifier;
import org.wamblee.general.BeanFactory;
import org.wamblee.xml.ClasspathUriResolver;
import org.wamblee.xml.XslTransformer;

/**
 * The KiSS crawler for automatic recording of interesting TV shows.
 *
 * All work is done in the constructor: the electronic programme guide is
 * crawled, the resulting TV guide is filtered for interesting programs, the
 * configured actions are executed for them, and a report is sent through the
 * notifier.
 */
public class KissCrawler {

    private static final Log LOG = LogFactory.getLog(KissCrawler.class);

    /**
     * Start URL of the electronic programme guide.
     */
    private static final String START_URL = "http://epg.kml.kiss-technology.com/login.php";

    /**
     * Default socket timeout to use.
     */
    private static final int SOCKET_TIMEOUT = 10000;

    /**
     * Regular expression for matching time interval strings in the retrieved
     * pages. Groups 1 and 2 are the hours and minutes of the begin time,
     * groups 3 and 4 the hours and minutes of the end time.
     */
    private static final String TIME_REGEX = "[^0-9]*([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";

    /**
     * Compiled pattern for the time regular expression. Compiled once because
     * the expression is constant.
     */
    private static final Pattern TIME_PATTERN = Pattern.compile(TIME_REGEX);

    /**
     * Runs the KiSS crawler.
     *
     * @param aArgs
     *            Two arguments are required: the path of the crawler
     *            configuration file followed by the path of the program
     *            configuration file.
     * @throws IllegalArgumentException
     *             In case fewer than two arguments are given.
     * @throws Exception
     *             In case of problems.
     */
    public static void main(String[] aArgs) throws Exception {
        if (aArgs == null || aArgs.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: KissCrawler <crawlerConfigFile> <programConfigFile>");
        }
        String crawlerConfig = new File(aArgs[0]).getCanonicalPath();
        String programConfig = new File(aArgs[1]).getCanonicalPath();

        BeanFactory factory = new StandaloneCrawlerBeanFactory();
        Notifier notifier = factory.find(Notifier.class);
        new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig,
                programConfig, notifier, new Report());
    }

    /**
     * Constructs the crawler using the default start URL and socket timeout.
     * This retrieves the TV guide by crawling the KiSS EPG guide, filters the
     * guide for interesting programs, tries to record them, and sends a
     * summary to the user.
     *
     * @param aCrawlerConfig
     *            Configuration file for the crawler.
     * @param aProgramConfig
     *            Configuration file describing interesting shows.
     * @param aNotifier
     *            Object used to send notifications of the results.
     * @param aReport
     *            Report to use.
     * @throws IOException
     *             In case of problems reading files.
     * @throws NotificationException
     *             In case notification fails.
     * @throws PageException
     *             In case of problems retrieving the TV guide.
     */
    public KissCrawler(String aCrawlerConfig, String aProgramConfig,
            Notifier aNotifier, Report aReport) throws IOException,
            NotificationException, PageException {
        this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig,
                aNotifier, aReport);
    }

    /**
     * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
     * EPG guide, filters the guide for interesting programs, tries to record
     * them, and sends a summary to the user.
     *
     * If the TV guide cannot be retrieved, the problem is added to the report
     * and logged, and the exception is rethrown; in that case the report is
     * not sent through the notifier.
     *
     * @param aStartUrl
     *            Start URL of the electronic programme guide.
     * @param aSocketTimeout
     *            Socket timeout to use.
     * @param aCrawlerConfig
     *            Configuration file for the crawler.
     * @param aProgramConfig
     *            Configuration file describing interesting shows.
     * @param aNotifier
     *            Object used to send notifications of the results.
     * @param aReport
     *            Report to use.
     * @throws IOException
     *             In case of problems reading files.
     * @throws NotificationException
     *             In case notification fails.
     * @throws PageException
     *             In case of problems retrieving the TV guide.
     */
    public KissCrawler(String aStartUrl, int aSocketTimeout,
            String aCrawlerConfig, String aProgramConfig, Notifier aNotifier,
            Report aReport) throws IOException, NotificationException,
            PageException {
        try {
            HttpClient client = new HttpClient();
            // Use the timeout requested by the caller, not the class default.
            client.getParams().setParameter("http.socket.timeout",
                    aSocketTimeout);

            XslTransformer transformer = new XslTransformer(
                    new ClasspathUriResolver());

            Crawler crawler = createCrawler(aCrawlerConfig, client, transformer);

            List<ProgramFilter> programFilters;
            InputStream programConfigFile = new FileInputStream(new File(
                    aProgramConfig));
            try {
                ProgramConfigurationParser parser = new ProgramConfigurationParser();
                parser.parse(programConfigFile);
                programFilters = parser.getFilters();
            } finally {
                closeQuietly(programConfigFile, aProgramConfig);
            }

            try {
                Page page = getStartPage(aStartUrl, crawler, aReport);
                TVGuide guide = createGuide(page, aReport);
                PrintVisitor printer = new PrintVisitor(System.out);
                guide.accept(printer);
                processResults(programFilters, guide, aNotifier, aReport);
            } catch (PageException e) {
                aReport.addMessage("Problem getting TV guide", e);
                LOG.info("Problem getting TV guide", e);
                throw e;
            }
            aNotifier.send(aReport.asXml());
        } finally {
            System.out.println("Crawler finished");
        }
    }

    /**
     * Executes the configured actions for all interesting shows.
     *
     * @param aProgramCondition
     *            Filters determining which shows are interesting, each with
     *            the action to execute for the shows it selects.
     * @param aGuide
     *            Television guide.
     * @param aNotifier
     *            Notifier; currently not used by this method.
     * @param aReport
     *            Report handed to the action executor.
     */
    private void processResults(List<ProgramFilter> aProgramCondition,
            TVGuide aGuide, Notifier aNotifier, Report aReport) {
        ProgramActionExecutor executor = new ProgramActionExecutor(aReport);
        for (ProgramFilter filter : aProgramCondition) {
            List<Program> programs = filter.apply(aGuide);
            ProgramAction action = filter.getAction();
            for (Program program : programs) {
                action.execute(program, executor);
            }
        }
        executor.commit();
    }

    /**
     * Creates the crawler.
     *
     * @param aCrawlerConfig
     *            Crawler configuration file.
     * @param aClient
     *            HTTP Client to use.
     * @param aTransformer
     *            XSL transformer passed to the configuration parser.
     * @return Crawler.
     * @throws FileNotFoundException
     *             In case configuration files cannot be found.
     */
    private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient,
            XslTransformer aTransformer) throws FileNotFoundException {
        ConfigurationParser parser = new ConfigurationParser(aTransformer);
        InputStream crawlerConfigFile = new FileInputStream(new File(
                aCrawlerConfig));
        try {
            Configuration config = parser.parse(crawlerConfigFile);
            return new CrawlerImpl(aClient, config);
        } finally {
            closeQuietly(crawlerConfigFile, aCrawlerConfig);
        }
    }

    /**
     * Gets the start page of the electronic programme guide. This involves
     * login and navigation to a suitable start page after logging in.
     *
     * @param aStartUrl
     *            URL of the electronic programme guide.
     * @param aCrawler
     *            Crawler to use.
     * @param aReport
     *            Report to use; currently not used by this method.
     * @return Starting page.
     * @throws PageException
     *             In case the login or the navigation to the favorite
     *             channels fails.
     */
    private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport)
            throws PageException {
        try {
            Page page = aCrawler.getPage(aStartUrl, new NameValuePair[0]);
            Action login = page.getAction("login");
            if (login == null) {
                // Same treatment as a missing favorites action below.
                String msg = "Login action not found on start page";
                throw new PageException(msg);
            }
            page = login.execute();
            Action favorites = page.getAction("channels-favorites");
            if (favorites == null) {
                String msg = "Channels favorites action not found on start page";
                throw new PageException(msg);
            }
            return favorites.execute();
        } catch (PageException e) {
            String msg = "Could not complete login to electronic programme guide.";
            throw new PageException(msg, e);
        }
    }

    /**
     * Creates the TV guide by web crawling. A channel whose pages cannot be
     * retrieved is reported and logged, and crawling continues with the next
     * channel.
     *
     * @param aPage
     *            Starting page.
     * @param aReport
     *            Report to use.
     * @return TV guide.
     * @throws PageException
     *             In case the starting page contains no channels.
     */
    private TVGuide createGuide(Page aPage, Report aReport)
            throws PageException {
        LOG.info("Obtaining full TV guide");
        Action[] actions = aPage.getActions();
        if (actions.length == 0) {
            LOG.error("No channels found");
            throw new PageException("No channels found");
        }
        List<Channel> channels = new ArrayList<Channel>();
        for (Action action : actions) {
            try {
                LOG.info("Getting channel info for '" + action.getName() + "'");
                Action tomorrow = action.execute().getAction("tomorrow");
                if (tomorrow == null) {
                    throw new PageException("Channel summary page for '"
                            + action.getName()
                            + "' does not contain required information");
                }
                Channel channel = createChannel(action.getName(), tomorrow
                        .execute(), aReport);
                channels.add(channel);
                if (SystemProperties.isDebugMode()) {
                    break; // Only one channel is crawled.
                }
            } catch (PageException e) {
                aReport.addMessage("Could not create channel information for '"
                        + action.getName() + "'");
                LOG.error("Could not create channel information for '"
                        + action.getName() + "'", e);
            }
        }
        return new TVGuide(channels);
    }

    /**
     * Create channel information for a specific channel. Entries whose time
     * text is missing or does not match the expected time interval format are
     * skipped.
     *
     * @param aChannel
     *            Channel name.
     * @param aPage
     *            Starting page for the channel.
     * @param aReport
     *            Report to which problems retrieving program details are
     *            added.
     * @return Channel.
     */
    private Channel createChannel(String aChannel, Page aPage, Report aReport) {
        LOG.info("Obtaining program for " + aChannel);
        Action[] programActions = aPage.getActions();
        List<Program> programs = new ArrayList<Program>();
        for (Action action : programActions) {
            // A missing time element yields "", which does not match the
            // pattern, so the entry is skipped instead of causing an NPE.
            String time = childText(action.getContent(), "time");
            Matcher matcher = TIME_PATTERN.matcher(time);
            if (!matcher.matches()) {
                continue;
            }
            Time begin = new Time(Integer.parseInt(matcher.group(1)),
                    Integer.parseInt(matcher.group(2)));
            Time end = new Time(Integer.parseInt(matcher.group(3)),
                    Integer.parseInt(matcher.group(4)));
            TimeInterval interval = new TimeInterval(begin, end);
            String description = "";
            String keywords = "";

            if (!SystemProperties.isNoProgramDetailsRequired()) {
                Element descriptionElem = action.getContent().element(
                        "description");
                if (descriptionElem == null) {
                    // No inline description: follow the action to the
                    // program details page.
                    try {
                        Page programInfo = action.execute();
                        Element details = programInfo.getContent();
                        description = childText(details, "description");
                        keywords = childText(details, "keywords");
                    } catch (PageException e) {
                        String msg = "Program details could not be determined for '"
                                + action.getName() + "'";
                        aReport.addMessage(msg, e);
                        LOG.warn(msg, e);
                    }
                } else {
                    description = descriptionElem.getTextTrim();
                }
            }
            Program program = new Program(aChannel, action.getName(),
                    description, keywords, interval, action);

            LOG.info("Got program " + program);
            programs.add(program);
        }
        return new Channel(aChannel, programs);
    }

    /**
     * Gets the trimmed text of a named child element.
     *
     * @param aParent
     *            Parent element, may be null.
     * @param aName
     *            Name of the child element.
     * @return Trimmed text of the child, or the empty string if the parent or
     *         the child is absent.
     */
    private static String childText(Element aParent, String aName) {
        if (aParent == null) {
            return "";
        }
        Element child = aParent.element(aName);
        return child == null ? "" : child.getText().trim();
    }

    /**
     * Closes a stream, logging instead of propagating a failure to close so
     * that it cannot mask an exception from the code that used the stream.
     *
     * @param aStream
     *            Stream to close.
     * @param aDescription
     *            Description of the stream used in the log message.
     */
    private static void closeQuietly(InputStream aStream, String aDescription) {
        try {
            aStream.close();
        } catch (IOException e) {
            LOG.warn("Could not close '" + aDescription + "'", e);
        }
    }
}