X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fkiss%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2Fmain%2FKissCrawler.java;h=db787eb6ea095e37e11717f97df91f9e763406aa;hb=1b449b5589b3ee40489c497d4351877d666b7907;hp=6cdaa17a55da35f9aec05a0794cb03c5e63441c5;hpb=3477b9963f17a8d32a1b62a80f80bcac9df939cf;p=utils diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java index 6cdaa17a..db787eb6 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/main/KissCrawler.java @@ -16,35 +16,21 @@ package org.wamblee.crawler.kiss.main; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.PrintStream; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; import java.util.List; -import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.mail.MessagingException; -import javax.mail.Session; -import javax.mail.internet.InternetAddress; -import javax.xml.transform.TransformerException; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.commons.mail.EmailException; -import org.apache.commons.mail.HtmlEmail; -import org.apache.xml.serialize.OutputFormat; -import org.apache.xml.serialize.XMLSerializer; -import org.w3c.dom.Document; import org.wamblee.crawler.Action; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; @@ -60,8 +46,9 @@ import org.wamblee.crawler.kiss.guide.Time; import org.wamblee.crawler.kiss.guide.TimeInterval; import org.wamblee.crawler.kiss.notification.NotificationException; import org.wamblee.crawler.kiss.notification.Notifier; -import org.wamblee.io.FileResource; -import org.wamblee.xml.XSLT; +import org.wamblee.general.BeanFactory; +import org.wamblee.xml.ClasspathUriResolver; +import org.wamblee.xml.XslTransformer; /** * The KiSS crawler for automatic recording of interesting TV shows. @@ -71,25 +58,15 @@ public class KissCrawler { private static final Log LOG = LogFactory.getLog(KissCrawler.class); - /** - * Log file name for the crawler. - */ - private static final String LOG_FILE = "kiss.log"; - /** * Start URL of the electronic programme guide. */ private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; - - /** - * Crawler configuration file. - */ - private static final String CRAWLER_CONFIG = "config.xml"; - + /** - * Configuration file describing interesting programs. + * Default socket timeout to use. */ - private static final String PROGRAM_CONFIG = "programs.xml"; + private static final int SOCKET_TIMEOUT = 10000; /** * Regular expression for matching time interval strings in the retrieved @@ -111,9 +88,36 @@ public class KissCrawler { * In case of problems. */ public static void main(String[] aArgs) throws Exception { - new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); + String crawlerConfig = new File(aArgs[0]).getCanonicalPath(); + String programConfig = new File(aArgs[1]).getCanonicalPath(); + + BeanFactory factory = new StandaloneCrawlerBeanFactory(); + Notifier notifier = factory.find(Notifier.class); + new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig, programConfig, notifier, new Report()); + } + + /** + * Constructs the crawler. This retrieves the TV guide by crawling the KiSS + * EPG guide, filters the guide for interesting programs, tries to record + * them, and sends a summary mail to the user. + * + * @param aCrawlerConfig + * Configuration file for the crawler. + * @param aProgramConfig + * Configuration file describing interesting shows. + * @param aNotifier Object used to send notifications of the results. + * @param aReport Report to use. + * @throws IOException + * In case of problems reading files. + * @throws NotificationException In case notification fails. + * @throws PageException In case of problems retrieving the TV guide. + */ + public KissCrawler(String aCrawlerConfig, + String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException { + this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig, aNotifier, aReport); } + /** * Constructs the crawler. This retrieves the TV guide by crawling the KiSS * EPG guide, filters the guide for interesting programs, tries to record @@ -121,43 +125,54 @@ public class KissCrawler { * * @param aStartUrl * Start URL of the electronic programme guide. + * @param aSocketTimeout Socket timeout to use. * @param aCrawlerConfig * Configuration file for the crawler. * @param aProgramConfig * Configuration file describing interesting shows. + * @param aNotifier Object used to send notifications of the results. + * @param aReport Report to use. * @throws IOException * In case of problems reading files. - * @throws MessagingException - * In case of problems sending a mail notification. + * @throws NotificationException In case notification fails. + * @throws PageException In case of problems retrieving the TV guide. */ - public KissCrawler(String aStartUrl, String aCrawlerConfig, - String aProgramConfig) throws IOException, MessagingException { + public KissCrawler(String aStartUrl, int aSocketTimeout, String aCrawlerConfig, + String aProgramConfig, Notifier aNotifier, Report aReport) throws IOException, NotificationException, PageException { _pattern = Pattern.compile(TIME_REGEX); - FileOutputStream fos = new FileOutputStream(new File(LOG_FILE)); - PrintStream os = new PrintStream(fos); - try { HttpClient client = new HttpClient(); - //client.getHostConfiguration().setProxy("127.0.0.1", 3128); + // client.getHostConfiguration().setProxy("127.0.0.1", 3128); + client.getParams().setParameter("http.socket.timeout", SOCKET_TIMEOUT); + + XslTransformer transformer = new XslTransformer( + new ClasspathUriResolver()); - Crawler crawler = createCrawler(aCrawlerConfig, os, client); + Crawler crawler = createCrawler(aCrawlerConfig, client, transformer); InputStream programConfigFile = new FileInputStream(new File( aProgramConfig)); - ProgramConfigurationParser parser = new ProgramConfigurationParser(); + ProgramConfigurationParser parser = new ProgramConfigurationParser( + transformer); parser.parse(programConfigFile); - List programFilters = parser.getFilters(); - - Page page = getStartPage(aStartUrl, crawler); - TVGuide guide = createGuide(page); - PrintVisitor printer = new PrintVisitor(System.out); - guide.accept(printer); - processResults(programFilters, guide, parser.getNotifier()); + List programFilters = parser.getFilters(); + + try { + Page page = getStartPage(aStartUrl, crawler, aReport); + TVGuide guide = createGuide(page, aReport); + PrintVisitor printer = new PrintVisitor(System.out); + guide.accept(printer); + processResults(programFilters, guide, aNotifier, + aReport); + } catch (PageException e) { + aReport.addMessage("Problem getting TV guide", e); + LOG.info("Problem getting TV guide", e); + throw e; + } + aNotifier.send(aReport.asXml()); } finally { - os.flush(); - os.close(); - System.out.println("Output written on '" + LOG_FILE + "'"); + System.out.println("Crawler finished"); } } @@ -172,8 +187,8 @@ public class KissCrawler { * In case of problems sending a summary mail. */ private void processResults(List aProgramCondition, - TVGuide aGuide, Notifier aNotifier) throws MessagingException { - ProgramActionExecutor executor = new ProgramActionExecutor(); + TVGuide aGuide, Notifier aNotifier, Report aReport) { + ProgramActionExecutor executor = new ProgramActionExecutor(aReport); for (ProgramFilter filter : aProgramCondition) { List programs = filter.apply(aGuide); ProgramAction action = filter.getAction(); @@ -182,11 +197,7 @@ public class KissCrawler { } } executor.commit(); - try { - aNotifier.send(executor.getXmlReport()); - } catch (NotificationException e) { - throw new RuntimeException(e); - } + } /** @@ -202,9 +213,9 @@ public class KissCrawler { * @throws FileNotFoundException * In case configuration files cannot be found. */ - private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs, - HttpClient aClient) throws FileNotFoundException { - ConfigurationParser parser = new ConfigurationParser(aOs); + private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient, + XslTransformer aTransformer) throws FileNotFoundException { + ConfigurationParser parser = new ConfigurationParser(aTransformer); InputStream crawlerConfigFile = new FileInputStream(new File( aCrawlerConfig)); Configuration config = parser.parse(crawlerConfigFile); @@ -220,15 +231,23 @@ public class KissCrawler { * URL of the electronic programme guide. * @param aCrawler * Crawler to use. + * @param aReport + * Report to use. * @return Starting page. */ - private Page getStartPage(String aStartUrl, Crawler aCrawler) { + private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport) + throws PageException { try { Page page = aCrawler.getPage(aStartUrl); - return page.getAction("channels-favorites").execute(); + Action favorites = page.getAction("channels-favorites"); + if (favorites == null) { + String msg = "Channels favorites action not found on start page"; + throw new PageException(msg); + } + return favorites.execute(); } catch (PageException e) { - throw new RuntimeException( - "Could not login to electronic program guide", e); + String msg = "Could not complete login to electronic programme guide."; + throw new PageException(msg, e); } } @@ -237,22 +256,36 @@ public class KissCrawler { * * @param aPage * Starting page. + * @param aReport + * Report to use. * @return TV guide. */ - private TVGuide createGuide(Page aPage) { + private TVGuide createGuide(Page aPage, Report aReport) { LOG.info("Obtaining full TV guide"); Action[] actions = aPage.getActions(); + if ( actions.length == 0 ) { + LOG.error("No channels found"); + aReport.addMessage("No channels found"); + } List channels = new ArrayList(); for (Action action : actions) { try { LOG.info("Getting channel info for '" + action.getName() + "'"); - Channel channel = createChannel(action.getName(), action - .execute().getAction("right-now").execute()); + Action rightNow = action.execute().getAction("right-now"); + if (rightNow == null) { + throw new PageException("Channel summary page for '" + + action.getName() + + "' does not contain required information"); + } + Channel channel = createChannel(action.getName(), rightNow + .execute(), aReport); channels.add(channel); if (SystemProperties.isDebugMode()) { break; // Only one channel is crawled. } } catch (PageException e) { + aReport.addMessage("Could not create channel information for '" + + action.getName() + "'"); LOG.error("Could not create channel information for '" + action.getName() + "'", e); } @@ -269,7 +302,7 @@ public class KissCrawler { * Starting page for the channel. * @return Channel. */ - private Channel createChannel(String aChannel, Page aPage) { + private Channel createChannel(String aChannel, Page aPage, Report aReport) { LOG.info("Obtaining program for " + aChannel); Action[] programActions = aPage.getActions(); List programs = new ArrayList(); @@ -292,9 +325,10 @@ public class KissCrawler { keywords = programInfo.getContent().element("keywords") .getText().trim(); } catch (PageException e) { - LOG.warn( - "Program details could not be determined for '" - + action.getName() + "'", e); + String msg = "Program details could not be determined for '" + + action.getName() + "'"; + aReport.addMessage(msg, e); + LOG.warn(msg, e); } } Program program = new Program(aChannel, action.getName(), @@ -306,51 +340,4 @@ public class KissCrawler { } return new Channel(aChannel, programs); } - - /** - * Sends a summary mail to the user. - * - * @param aText - * Text of the mail. - * @throws MessagingException - * In case of problems sending mail. - */ - private void sendMail(ProgramActionExecutor aExecutor) throws MessagingException { - String textReport = aExecutor.getReport(); - System.out.println("Text report: \n" + textReport); - System.out.println("XML report:\n" + aExecutor.getXmlReport().asXML()); - - - Properties props = new Properties(); - props.put("mail.transport.protocol", "smtp"); - props.put("mail.smtp.host", "falcon"); - props.put("mail.smtp.port", "25"); - - Session mailSession = Session.getInstance(props); - InternetAddress from = new InternetAddress("erik@brakkee.org"); - - HtmlEmail mail = new HtmlEmail(); - mail.setMailSession(mailSession); - try { - mail.setFrom("erik@brakkee.org"); - mail.setTo(Arrays.asList(new InternetAddress[] { from })); - mail.setSentDate(new Date()); - mail.setSubject("KiSS Crawler Update"); - String html = aExecutor.getXmlReport().asXML(); - Document document = new XSLT().transform(html.getBytes(), new FileResource(new File("reportToHtml.xsl"))); - ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); - XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat()); - serializer.serialize(document); - mail.setHtmlMsg(xhtml.toString()); - mail.setTextMsg(textReport); - mail.send(); - } catch (EmailException e) { - throw new RuntimeException(e); - } catch (TransformerException e) { - throw new RuntimeException(e); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }