X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fkiss%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2FKissCrawler.java;h=0549e48f3a8a99910512c8ca125f574a021d027c;hb=c0da3814aaa1e707d253202ceb44fa745c671de8;hp=298e94435a495c3357aaff8c1a5f7e7f56930e19;hpb=d4bb47fd284738756cd112b788a49caa1a9d5c38;p=utils diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java index 298e9443..0549e48f 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java @@ -16,6 +16,7 @@ package org.wamblee.crawler.kiss; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -24,6 +25,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.PrintStream; import java.util.ArrayList; +import java.util.Arrays; import java.util.Date; import java.util.List; import java.util.Properties; @@ -34,15 +36,18 @@ import javax.mail.Message; import javax.mail.MessagingException; import javax.mail.Session; import javax.mail.Transport; -import javax.mail.internet.AddressException; import javax.mail.internet.InternetAddress; import javax.mail.internet.MimeMessage; +import javax.xml.transform.TransformerException; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.dom4j.Element; -import org.wamblee.conditions.Condition; +import org.apache.commons.mail.EmailException; +import org.apache.commons.mail.HtmlEmail; +import org.apache.xml.serialize.OutputFormat; +import org.apache.xml.serialize.XMLSerializer; +import org.w3c.dom.Document; import org.wamblee.crawler.Action; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; @@ -50,29 +55,78 @@ import org.wamblee.crawler.Page; import org.wamblee.crawler.PageException; import org.wamblee.crawler.impl.ConfigurationParser; import org.wamblee.crawler.impl.CrawlerImpl; +import org.wamblee.io.FileResource; +import org.wamblee.xml.XSLT; /** + * The KiSS crawler for automatic recording of interesting TV shows. * */ public class KissCrawler { private static final Log LOG = LogFactory.getLog(KissCrawler.class); + /** + * Log file name for the crawler. + */ private static final String LOG_FILE = "kiss.log"; + /** + * Start URL of the electronic programme guide. + */ private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; + /** + * Crawler configuration file. + */ private static final String CRAWLER_CONFIG = "config.xml"; + /** + * Configuration file describing interesting programs. + */ private static final String PROGRAM_CONFIG = "programs.xml"; + /** + * Regular expression for matching time interval strings in the retrieved + * pages. + */ private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; + /** + * Compiled pattern for the time regular expression. + */ private Pattern _pattern; + /** + * Runs the KiSS crawler. + * + * @param aArgs + * Arguments, currently all ignored because they are hardcoded. + * @throws Exception + * In case of problems. + */ + public static void main(String[] aArgs) throws Exception { + new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); + } + + /** + * Constructs the crawler. This retrieves the TV guide by crawling the KiSS + * EPG guide, filters the guide for interesting programs, tries to record + * them, and sends a summary mail to the user. + * + * @param aStartUrl + * Start URL of the electronic programme guide. + * @param aCrawlerConfig + * Configuration file for the crawler. + * @param aProgramConfig + * Configuration file describing interesting shows. + * @throws IOException + * In case of problems reading files. + * @throws MessagingException + * In case of problems sending a mail notification. + */ public KissCrawler(String aStartUrl, String aCrawlerConfig, - String aProgramConfig) throws IOException, AddressException, - MessagingException { + String aProgramConfig) throws IOException, MessagingException { _pattern = Pattern.compile(TIME_REGEX); @@ -81,20 +135,19 @@ public class KissCrawler { try { HttpClient client = new HttpClient(); - // client.getHostConfiguration().setProxy("localhost", 3128); + // client.getHostConfiguration().setProxy("127.0.0.1", 3128); Crawler crawler = createCrawler(aCrawlerConfig, os, client); + InputStream programConfigFile = new FileInputStream(new File( + aProgramConfig)); + List programFilters = new ProgramConfigurationParser() + .parse(programConfigFile); Page page = getStartPage(aStartUrl, crawler); TVGuide guide = createGuide(page); PrintVisitor printer = new PrintVisitor(System.out); guide.accept(printer); - - InputStream programConfigFile = new FileInputStream(new File( - aProgramConfig)); - Condition programCondition = new ProgramConfigurationParser() - .parse(programConfigFile); - recordInterestingShows(programCondition, guide); + processResults(programFilters, guide); } finally { os.flush(); os.close(); @@ -103,75 +156,65 @@ public class KissCrawler { } /** - * @param programCondition - * @param guide - * @throws AddressException + * Records interesting shows. + * + * @param aProgramCondition + * Condition determining which shows are interesting. + * @param aGuide + * Television guide. * @throws MessagingException + * In case of problems sending a summary mail. */ - private void recordInterestingShows(Condition programCondition, - TVGuide guide) throws AddressException, MessagingException { - MatchVisitor matcher = new MatchVisitor(programCondition); - guide.accept(matcher); - List programs = matcher.getMatches(); - String recorded = ""; - String notRecorded = ""; - String failures = ""; - for (Program program : programs) { - try { - boolean result = program.record(); - if (result) { - recorded += "\n" + program; - } else { - notRecorded += "\n" + program; - } - } catch (PageException e) { - LOG.info("Attempt to record " + program + " failed."); - failures += "\n" + program.toString() + ": " + e.getMessage(); + private void processResults(List aProgramCondition, + TVGuide aGuide) throws MessagingException { + ProgramActionExecutor executor = new ProgramActionExecutor(); + for (ProgramFilter filter : aProgramCondition) { + List programs = filter.apply(aGuide); + ProgramAction action = filter.getAction(); + for (Program program : programs) { + action.execute(program, executor); } } - String msg = "Summary of KiSS crawler: \n\n\n"; - - if (recorded.length() > 0) { - msg += "Recorded programs:\n\n" + recorded + "\n\n"; - } - if (notRecorded.length() > 0) { - msg += "Not recorded programs:\n\n" + notRecorded + "\n\n"; - } - if (recorded.length() == 0 && notRecorded.length() == 0) { - msg += "No suitable programs found"; - } - if (failures.length() > 0) { - msg += "Failures:\n\n" + failures; - } - System.out.println(msg); - sendMail(msg); + executor.commit(); + sendMail(executor); } /** + * Creates the crawler. + * * @param aCrawlerConfig - * @param os - * @param client - * @return + * Crawler configuration file. + * @param aOs + * Logging output stream for the crawler. + * @param aClient + * HTTP Client to use. + * @return Crawler. * @throws FileNotFoundException + * In case configuration files cannot be found. */ - private Crawler createCrawler(String aCrawlerConfig, PrintStream os, - HttpClient client) throws FileNotFoundException { - ConfigurationParser parser = new ConfigurationParser(os); + private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs, + HttpClient aClient) throws FileNotFoundException { + ConfigurationParser parser = new ConfigurationParser(aOs); InputStream crawlerConfigFile = new FileInputStream(new File( aCrawlerConfig)); Configuration config = parser.parse(crawlerConfigFile); - Crawler crawler = new CrawlerImpl(client, config); + Crawler crawler = new CrawlerImpl(aClient, config); return crawler; } /** + * Gets the start page of the electronic programme guide. This involves + * login and navigation to a suitable start page after logging in. + * * @param aStartUrl - * @param crawler - * @return + * URL of the electronic programme guide. + * @param aCrawler + * Crawler to use. + * @return Starting page. */ - private Page getStartPage(String aStartUrl, Crawler crawler) { + private Page getStartPage(String aStartUrl, Crawler aCrawler) { try { - Page page = crawler.getPage(aStartUrl); + Page page = aCrawler.getPage(aStartUrl); return page.getAction("channels-favorites").execute(); } catch (PageException e) { throw new RuntimeException( @@ -179,22 +222,16 @@ public class KissCrawler { } } - public static void main(String[] args) throws Exception { - new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); - } - - private void showPage(Page aPage) { - Action[] links = aPage.getActions(); - for (Action link : links) { - System.out.println("Link found '" + link.getName() + "'"); - } - Element element = aPage.getContent(); - System.out.println("Retrieved content: " + element.asXML()); - } - - private TVGuide createGuide(Page page) { + /** + * Creates the TV guide by web crawling. + * + * @param aPage + * Starting page. + * @return TV guide. + */ + private TVGuide createGuide(Page aPage) { LOG.info("Obtaining full TV guide"); - Action[] actions = page.getActions(); + Action[] actions = aPage.getActions(); List channels = new ArrayList(); for (Action action : actions) { try { @@ -202,6 +239,9 @@ public class KissCrawler { Channel channel = createChannel(action.getName(), action .execute().getAction("right-now").execute()); channels.add(channel); + if (SystemProperties.isDebugMode()) { + break; // Only one channel is crawled. + } } catch (PageException e) { LOG.error("Could not create channel information for '" + action.getName() + "'", e); @@ -210,6 +250,15 @@ public class KissCrawler { return new TVGuide(channels); } + /** + * Create channel information for a specific channel. + * + * @param aChannel + * Channel name. + * @param aPage + * Starting page for the channel. + * @return Channel. + */ private Channel createChannel(String aChannel, Page aPage) { LOG.info("Obtaining program for " + aChannel); Action[] programActions = aPage.getActions(); @@ -223,39 +272,75 @@ public class KissCrawler { Time end = new Time(Integer.parseInt(matcher.group(3)), Integer .parseInt(matcher.group(4))); TimeInterval interval = new TimeInterval(begin, end); - // Page programInfo = action.execute(); - // String description = - // programInfo.getContent().element("description").getText().trim(); - // String keywords = - // programInfo.getContent().element("keywords").getText().trim(); String description = ""; String keywords = ""; + if (!SystemProperties.isNoProgramDetailsRequired()) { + try { + Page programInfo = action.execute(); + description = programInfo.getContent().element( + "description").getText().trim(); + keywords = programInfo.getContent().element("keywords") + .getText().trim(); + } catch (PageException e) { + LOG.warn( + "Program details could not be determined for '" + + action.getName() + "'", e); + } + } Program program = new Program(aChannel, action.getName(), description, keywords, interval, action); - LOG.debug("Got program " + program); + LOG.info("Got program " + program); programs.add(program); } } return new Channel(aChannel, programs); } - private void sendMail(String aText) throws AddressException, - MessagingException { + /** + * Sends a summary mail to the user. + * + * @param aText + * Text of the mail. + * @throws MessagingException + * In case of problems sending mail. + */ + private void sendMail(ProgramActionExecutor aExecutor) throws MessagingException { + String textReport = aExecutor.getReport(); + System.out.println("Text report: \n" + textReport); + System.out.println("XML report:\n" + aExecutor.getXmlReport().asXML()); + + Properties props = new Properties(); props.put("mail.transport.protocol", "smtp"); props.put("mail.smtp.host", "falcon"); props.put("mail.smtp.port", "25"); Session mailSession = Session.getInstance(props); - Message message = new MimeMessage(mailSession); - - message.setFrom(new InternetAddress("erik@brakkee.org")); - message.setRecipient(Message.RecipientType.TO, new InternetAddress( - "erik@brakkee.org")); - message.setSentDate(new Date()); - message.setSubject("KiSS crawler update"); - message.setText(aText); - Transport.send(message); + InternetAddress from = new InternetAddress("erik@brakkee.org"); + + HtmlEmail mail = new HtmlEmail(); + mail.setMailSession(mailSession); + try { + mail.setFrom("erik@brakkee.org"); + mail.setTo(Arrays.asList(new InternetAddress[] { from })); + mail.setSentDate(new Date()); + mail.setSubject("KiSS Crawler Update"); + String html = aExecutor.getXmlReport().asXML(); + Document document = new XSLT().transform(html.getBytes(), new FileResource(new File("reportToHtml.xsl"))); + ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); + XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat()); + serializer.serialize(document); + mail.setHtmlMsg(xhtml.toString()); + mail.setTextMsg(textReport); + mail.send(); + } catch (EmailException e) { + throw new RuntimeException(e); + } catch (TransformerException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } } + }