X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fkiss%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fkiss%2FKissCrawler.java;h=fc076a5abaf7294e4fc96d747ee39674955d999e;hb=602681b15e687db2a7ea0e92796857a6df31bba2;hp=298e94435a495c3357aaff8c1a5f7e7f56930e19;hpb=d4bb47fd284738756cd112b788a49caa1a9d5c38;p=utils diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java index 298e9443..fc076a5a 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java @@ -34,7 +34,6 @@ import javax.mail.Message; import javax.mail.MessagingException; import javax.mail.Session; import javax.mail.Transport; -import javax.mail.internet.AddressException; import javax.mail.internet.InternetAddress; import javax.mail.internet.MimeMessage; @@ -52,27 +51,65 @@ import org.wamblee.crawler.impl.ConfigurationParser; import org.wamblee.crawler.impl.CrawlerImpl; /** + * The KiSS crawler for automatic recording of interesting TV shows. * */ public class KissCrawler { private static final Log LOG = LogFactory.getLog(KissCrawler.class); + /** + * Log file name for the crawler. + */ private static final String LOG_FILE = "kiss.log"; + /** + * Start URL of the electronic programme guide. + */ private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; + /** + * Crawler configuration file. + */ private static final String CRAWLER_CONFIG = "config.xml"; + /** + * Configuration file describing interesting programs. + */ private static final String PROGRAM_CONFIG = "programs.xml"; + /** + * Regular expression for matching time interval strings in the + * retrieved pages. + */ private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; + /** + * Compiled pattern for the time regular expression. + */ private Pattern _pattern; + /** + * Runs the KiSS crawler. + * @param aArgs Arguments, currently all ignored because they are hardcoded. + * @throws Exception In case of problems. + */ + public static void main(String[] aArgs) throws Exception { + new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); + } + + /** + * Constructs the crawler. This retrieves the TV guide by crawling the + * KiSS EPG guide, filters the guide for interesting programs, tries to + * record them, and sends a summary mail to the user. + * @param aStartUrl Start URL of the electronic programme guide. + * @param aCrawlerConfig Configuration file for the crawler. + * @param aProgramConfig Configuration file describing interesting shows. + * @throws IOException In case of problems reading files. + * @throws MessagingException In case of problems sending a mail notification. + */ public KissCrawler(String aStartUrl, String aCrawlerConfig, - String aProgramConfig) throws IOException, AddressException, - MessagingException { + String aProgramConfig) throws IOException, MessagingException { _pattern = Pattern.compile(TIME_REGEX); @@ -81,7 +118,7 @@ public class KissCrawler { try { HttpClient client = new HttpClient(); - // client.getHostConfiguration().setProxy("localhost", 3128); + client.getHostConfiguration().setProxy("127.0.0.1", 3128); Crawler crawler = createCrawler(aCrawlerConfig, os, client); @@ -103,15 +140,15 @@ public class KissCrawler { } /** - * @param programCondition - * @param guide - * @throws AddressException - * @throws MessagingException + * Records interesting shows. + * @param aProgramCondition Condition determining which shows are interesting. + * @param aGuide Television guide. + * @throws MessagingException In case of problems sending a summary mail. */ - private void recordInterestingShows(Condition programCondition, - TVGuide guide) throws AddressException, MessagingException { - MatchVisitor matcher = new MatchVisitor(programCondition); - guide.accept(matcher); + private void recordInterestingShows(Condition aProgramCondition, + TVGuide aGuide) throws MessagingException { + MatchVisitor matcher = new MatchVisitor(aProgramCondition); + aGuide.accept(matcher); List programs = matcher.getMatches(); String recorded = ""; String notRecorded = ""; @@ -148,30 +185,33 @@ public class KissCrawler { } /** - * @param aCrawlerConfig - * @param os - * @param client - * @return - * @throws FileNotFoundException + * Creates the crawler. + * @param aCrawlerConfig Crawler configuration file. + * @param aOs Logging output stream for the crawler. + * @param aClient HTTP Client to use. + * @return Crawler. + * @throws FileNotFoundException In case configuration files cannot be found. */ - private Crawler createCrawler(String aCrawlerConfig, PrintStream os, - HttpClient client) throws FileNotFoundException { - ConfigurationParser parser = new ConfigurationParser(os); + private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs, + HttpClient aClient) throws FileNotFoundException { + ConfigurationParser parser = new ConfigurationParser(aOs); InputStream crawlerConfigFile = new FileInputStream(new File( aCrawlerConfig)); Configuration config = parser.parse(crawlerConfigFile); - Crawler crawler = new CrawlerImpl(client, config); + Crawler crawler = new CrawlerImpl(aClient, config); return crawler; } /** - * @param aStartUrl - * @param crawler - * @return + * Gets the start page of the electronic programme guide. This involves login and + * navigation to a suitable start page after logging in. + * @param aStartUrl URL of the electronic programme guide. + * @param aCrawler Crawler to use. + * @return Starting page. */ - private Page getStartPage(String aStartUrl, Crawler crawler) { + private Page getStartPage(String aStartUrl, Crawler aCrawler) { try { - Page page = crawler.getPage(aStartUrl); + Page page = aCrawler.getPage(aStartUrl); return page.getAction("channels-favorites").execute(); } catch (PageException e) { throw new RuntimeException( @@ -179,22 +219,14 @@ public class KissCrawler { } } - public static void main(String[] args) throws Exception { - new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); - } - - private void showPage(Page aPage) { - Action[] links = aPage.getActions(); - for (Action link : links) { - System.out.println("Link found '" + link.getName() + "'"); - } - Element element = aPage.getContent(); - System.out.println("Retrieved content: " + element.asXML()); - } - - private TVGuide createGuide(Page page) { + /** + * Creates the TV guide by web crawling. + * @param aPage Starting page. + * @return TV guide. + */ + private TVGuide createGuide(Page aPage) { LOG.info("Obtaining full TV guide"); - Action[] actions = page.getActions(); + Action[] actions = aPage.getActions(); List channels = new ArrayList(); for (Action action : actions) { try { @@ -210,6 +242,12 @@ public class KissCrawler { return new TVGuide(channels); } + /** + * Create channel information for a specific channel. + * @param aChannel Channel name. + * @param aPage Starting page for the channel. + * @return Channel. + */ private Channel createChannel(String aChannel, Page aPage) { LOG.info("Obtaining program for " + aChannel); Action[] programActions = aPage.getActions(); @@ -240,8 +278,12 @@ public class KissCrawler { return new Channel(aChannel, programs); } - private void sendMail(String aText) throws AddressException, - MessagingException { + /** + * Sends a summary mail to the user. + * @param aText Text of the mail. + * @throws MessagingException In case of problems sending mail. + */ + private void sendMail(String aText) throws MessagingException { Properties props = new Properties(); props.put("mail.transport.protocol", "smtp"); props.put("mail.smtp.host", "falcon");