X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fkiss%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2FKissCrawler.java;fp=crawler%2Fkiss%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2FKissCrawler.java;h=0000000000000000000000000000000000000000;hb=d85bc24e068a68a54786fae5dc71573607b3b0cb;hp=6e705315ef77e459bd7c74bb600d7871658ae1d8;hpb=d62660907eb707c41d5e995b09280dea9dc2ab2a;p=utils diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java deleted file mode 100644 index 6e705315..00000000 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java +++ /dev/null @@ -1,348 +0,0 @@ -/* - * Copyright 2005 the original author or authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.wamblee.crawler.kiss; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.List; -import java.util.Properties; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import javax.mail.MessagingException; -import javax.mail.Session; -import javax.mail.internet.InternetAddress; -import javax.xml.transform.TransformerException; - -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.commons.mail.EmailException; -import org.apache.commons.mail.HtmlEmail; -import org.apache.xml.serialize.OutputFormat; -import org.apache.xml.serialize.XMLSerializer; -import org.w3c.dom.Document; -import org.wamblee.crawler.Action; -import org.wamblee.crawler.Configuration; -import org.wamblee.crawler.Crawler; -import org.wamblee.crawler.Page; -import org.wamblee.crawler.PageException; -import org.wamblee.crawler.impl.ConfigurationParser; -import org.wamblee.crawler.impl.CrawlerImpl; -import org.wamblee.io.FileResource; -import org.wamblee.xml.XSLT; - -/** - * The KiSS crawler for automatic recording of interesting TV shows. - * - */ -public class KissCrawler { - - private static final Log LOG = LogFactory.getLog(KissCrawler.class); - - /** - * Log file name for the crawler. - */ - private static final String LOG_FILE = "kiss.log"; - - /** - * Start URL of the electronic programme guide. - */ - private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; - - /** - * Crawler configuration file. - */ - private static final String CRAWLER_CONFIG = "config.xml"; - - /** - * Configuration file describing interesting programs. - */ - private static final String PROGRAM_CONFIG = "programs.xml"; - - /** - * Regular expression for matching time interval strings in the retrieved - * pages. - */ - private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; - - /** - * Compiled pattern for the time regular expression. - */ - private Pattern _pattern; - - /** - * Runs the KiSS crawler. - * - * @param aArgs - * Arguments, currently all ignored because they are hardcoded. - * @throws Exception - * In case of problems. - */ - public static void main(String[] aArgs) throws Exception { - new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); - } - - /** - * Constructs the crawler. This retrieves the TV guide by crawling the KiSS - * EPG guide, filters the guide for interesting programs, tries to record - * them, and sends a summary mail to the user. - * - * @param aStartUrl - * Start URL of the electronic programme guide. - * @param aCrawlerConfig - * Configuration file for the crawler. - * @param aProgramConfig - * Configuration file describing interesting shows. - * @throws IOException - * In case of problems reading files. - * @throws MessagingException - * In case of problems sending a mail notification. - */ - public KissCrawler(String aStartUrl, String aCrawlerConfig, - String aProgramConfig) throws IOException, MessagingException { - - _pattern = Pattern.compile(TIME_REGEX); - - FileOutputStream fos = new FileOutputStream(new File(LOG_FILE)); - PrintStream os = new PrintStream(fos); - - try { - HttpClient client = new HttpClient(); - //client.getHostConfiguration().setProxy("127.0.0.1", 3128); - - Crawler crawler = createCrawler(aCrawlerConfig, os, client); - InputStream programConfigFile = new FileInputStream(new File( - aProgramConfig)); - ProgramConfigurationParser parser = new ProgramConfigurationParser(); - parser.parse(programConfigFile); - List programFilters = parser.getFilters(); - - Page page = getStartPage(aStartUrl, crawler); - TVGuide guide = createGuide(page); - PrintVisitor printer = new PrintVisitor(System.out); - guide.accept(printer); - processResults(programFilters, guide, parser.getNotifier()); - } finally { - os.flush(); - os.close(); - System.out.println("Output written on '" + LOG_FILE + "'"); - } - } - - /** - * Records interesting shows. - * - * @param aProgramCondition - * Condition determining which shows are interesting. - * @param aGuide - * Television guide. - * @throws MessagingException - * In case of problems sending a summary mail. - */ - private void processResults(List aProgramCondition, - TVGuide aGuide, Notifier aNotifier) throws MessagingException { - ProgramActionExecutor executor = new ProgramActionExecutor(); - for (ProgramFilter filter : aProgramCondition) { - List programs = filter.apply(aGuide); - ProgramAction action = filter.getAction(); - for (Program program : programs) { - action.execute(program, executor); - } - } - executor.commit(); - try { - aNotifier.send(executor.getXmlReport()); - } catch (NotificationException e) { - throw new RuntimeException(e); - } - } - - /** - * Creates the crawler. - * - * @param aCrawlerConfig - * Crawler configuration file. - * @param aOs - * Logging output stream for the crawler. - * @param aClient - * HTTP Client to use. - * @return Crawler. - * @throws FileNotFoundException - * In case configuration files cannot be found. - */ - private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs, - HttpClient aClient) throws FileNotFoundException { - ConfigurationParser parser = new ConfigurationParser(aOs); - InputStream crawlerConfigFile = new FileInputStream(new File( - aCrawlerConfig)); - Configuration config = parser.parse(crawlerConfigFile); - Crawler crawler = new CrawlerImpl(aClient, config); - return crawler; - } - - /** - * Gets the start page of the electronic programme guide. This involves - * login and navigation to a suitable start page after logging in. - * - * @param aStartUrl - * URL of the electronic programme guide. - * @param aCrawler - * Crawler to use. - * @return Starting page. - */ - private Page getStartPage(String aStartUrl, Crawler aCrawler) { - try { - Page page = aCrawler.getPage(aStartUrl); - return page.getAction("channels-favorites").execute(); - } catch (PageException e) { - throw new RuntimeException( - "Could not login to electronic program guide", e); - } - } - - /** - * Creates the TV guide by web crawling. - * - * @param aPage - * Starting page. - * @return TV guide. - */ - private TVGuide createGuide(Page aPage) { - LOG.info("Obtaining full TV guide"); - Action[] actions = aPage.getActions(); - List channels = new ArrayList(); - for (Action action : actions) { - try { - LOG.info("Getting channel info for '" + action.getName() + "'"); - Channel channel = createChannel(action.getName(), action - .execute().getAction("right-now").execute()); - channels.add(channel); - if (SystemProperties.isDebugMode()) { - break; // Only one channel is crawled. - } - } catch (PageException e) { - LOG.error("Could not create channel information for '" - + action.getName() + "'", e); - } - } - return new TVGuide(channels); - } - - /** - * Create channel information for a specific channel. - * - * @param aChannel - * Channel name. - * @param aPage - * Starting page for the channel. - * @return Channel. - */ - private Channel createChannel(String aChannel, Page aPage) { - LOG.info("Obtaining program for " + aChannel); - Action[] programActions = aPage.getActions(); - List programs = new ArrayList(); - for (Action action : programActions) { - String time = action.getContent().element("time").getText().trim(); - Matcher matcher = _pattern.matcher(time); - if (matcher.matches()) { - Time begin = new Time(Integer.parseInt(matcher.group(1)), - Integer.parseInt(matcher.group(2))); - Time end = new Time(Integer.parseInt(matcher.group(3)), Integer - .parseInt(matcher.group(4))); - TimeInterval interval = new TimeInterval(begin, end); - String description = ""; - String keywords = ""; - if (!SystemProperties.isNoProgramDetailsRequired()) { - try { - Page programInfo = action.execute(); - description = programInfo.getContent().element( - "description").getText().trim(); - keywords = programInfo.getContent().element("keywords") - .getText().trim(); - } catch (PageException e) { - LOG.warn( - "Program details could not be determined for '" - + action.getName() + "'", e); - } - } - Program program = new Program(aChannel, action.getName(), - description, keywords, interval, action); - - LOG.info("Got program " + program); - programs.add(program); - } - } - return new Channel(aChannel, programs); - } - - /** - * Sends a summary mail to the user. - * - * @param aText - * Text of the mail. - * @throws MessagingException - * In case of problems sending mail. - */ - private void sendMail(ProgramActionExecutor aExecutor) throws MessagingException { - String textReport = aExecutor.getReport(); - System.out.println("Text report: \n" + textReport); - System.out.println("XML report:\n" + aExecutor.getXmlReport().asXML()); - - - Properties props = new Properties(); - props.put("mail.transport.protocol", "smtp"); - props.put("mail.smtp.host", "falcon"); - props.put("mail.smtp.port", "25"); - - Session mailSession = Session.getInstance(props); - InternetAddress from = new InternetAddress("erik@brakkee.org"); - - HtmlEmail mail = new HtmlEmail(); - mail.setMailSession(mailSession); - try { - mail.setFrom("erik@brakkee.org"); - mail.setTo(Arrays.asList(new InternetAddress[] { from })); - mail.setSentDate(new Date()); - mail.setSubject("KiSS Crawler Update"); - String html = aExecutor.getXmlReport().asXML(); - Document document = new XSLT().transform(html.getBytes(), new FileResource(new File("reportToHtml.xsl"))); - ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); - XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat()); - serializer.serialize(document); - mail.setHtmlMsg(xhtml.toString()); - mail.setTextMsg(textReport); - mail.send(); - } catch (EmailException e) { - throw new RuntimeException(e); - } catch (TransformerException e) { - throw new RuntimeException(e); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - -}