Migration to maven almost complete. At least everything builds and works
[utils] / crawler / kiss / src / main / java / org / wamblee / crawler / kiss / main / KissCrawler.java
diff --git a/crawler/kiss/src/main/java/org/wamblee/crawler/kiss/main/KissCrawler.java b/crawler/kiss/src/main/java/org/wamblee/crawler/kiss/main/KissCrawler.java
new file mode 100644 (file)
index 0000000..3300e12
--- /dev/null
@@ -0,0 +1,368 @@
+/*
+ * Copyright 2005 the original author or authors.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss.main;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.mail.MessagingException;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Element;
+import org.wamblee.crawler.Action;
+import org.wamblee.crawler.Configuration;
+import org.wamblee.crawler.Crawler;
+import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
+import org.wamblee.crawler.impl.ConfigurationParser;
+import org.wamblee.crawler.impl.CrawlerImpl;
+import org.wamblee.crawler.kiss.guide.Channel;
+import org.wamblee.crawler.kiss.guide.PrintVisitor;
+import org.wamblee.crawler.kiss.guide.Program;
+import org.wamblee.crawler.kiss.guide.TVGuide;
+import org.wamblee.crawler.kiss.guide.Time;
+import org.wamblee.crawler.kiss.guide.TimeInterval;
+import org.wamblee.crawler.kiss.notification.NotificationException;
+import org.wamblee.crawler.kiss.notification.Notifier;
+import org.wamblee.general.BeanFactory;
+import org.wamblee.xml.ClasspathUriResolver;
+import org.wamblee.xml.XslTransformer;
+
+/**
+ * The KiSS crawler for automatic recording of interesting TV shows.
+ * 
+ */
+public class KissCrawler {
+
+    private static final Log LOG = LogFactory.getLog(KissCrawler.class);
+
+    /**
+     * Start URL of the electronic programme guide.
+     */
+    private static final String START_URL = "http://epg.kml.kiss-technology.com/login.php";
+
+    /**
+     * Default socket timeout to use.
+     */
+    private static final int SOCKET_TIMEOUT = 10000;
+
+    /**
+     * Regular expression for matching time interval strings in the retrieved
+     * pages.
+     */
+    private static final String TIME_REGEX = "[^0-9]*([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
+
+    /**
+     * Compiled pattern for the time regular expression.
+     */
+    private Pattern _pattern;
+
+    /**
+     * Runs the KiSS crawler.
+     * 
+     * @param aArgs
+     *            Arguments, currently all ignored because they are hardcoded.
+     * @throws Exception
+     *             In case of problems.
+     */
+    public static void main(String[] aArgs) throws Exception {
+        String crawlerConfig = new File(aArgs[0]).getCanonicalPath();
+        String programConfig = new File(aArgs[1]).getCanonicalPath();
+
+        BeanFactory factory = new StandaloneCrawlerBeanFactory();
+        Notifier notifier = factory.find(Notifier.class);
+        new KissCrawler(START_URL, SOCKET_TIMEOUT, crawlerConfig,
+                programConfig, notifier, new Report());
+    }
+
+    /**
+     * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
+     * EPG guide, filters the guide for interesting programs, tries to record
+     * them, and sends a summary mail to the user.
+     * 
+     * @param aCrawlerConfig
+     *            Configuration file for the crawler.
+     * @param aProgramConfig
+     *            Configuration file describing interesting shows.
+     * @param aNotifier
+     *            Object used to send notifications of the results.
+     * @param aReport
+     *            Report to use.
+     * @throws IOException
+     *             In case of problems reading files.
+     * @throws NotificationException
+     *             In case notification fails.
+     * @throws PageException
+     *             In case of problems retrieving the TV guide.
+     */
+    public KissCrawler(String aCrawlerConfig, String aProgramConfig,
+            Notifier aNotifier, Report aReport) throws IOException,
+            NotificationException, PageException {
+        this(START_URL, SOCKET_TIMEOUT, aCrawlerConfig, aProgramConfig,
+                aNotifier, aReport);
+    }
+
+    /**
+     * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
+     * EPG guide, filters the guide for interesting programs, tries to record
+     * them, and sends a summary mail to the user.
+     * 
+     * @param aStartUrl
+     *            Start URL of the electronic programme guide.
+     * @param aSocketTimeout
+     *            Socket timeout to use.
+     * @param aCrawlerConfig
+     *            Configuration file for the crawler.
+     * @param aProgramConfig
+     *            Configuration file describing interesting shows.
+     * @param aNotifier
+     *            Object used to send notifications of the results.
+     * @param aReport
+     *            Report to use.
+     * @throws IOException
+     *             In case of problems reading files.
+     * @throws NotificationException
+     *             In case notification fails.
+     * @throws PageException
+     *             In case of problems retrieving the TV guide.
+     */
+    public KissCrawler(String aStartUrl, int aSocketTimeout,
+            String aCrawlerConfig, String aProgramConfig, Notifier aNotifier,
+            Report aReport) throws IOException, NotificationException,
+            PageException {
+
+        _pattern = Pattern.compile(TIME_REGEX);
+
+        try {
+            HttpClient client = new HttpClient();
+            // client.getHostConfiguration().setProxy("127.0.0.1", 3128);
+            client.getParams().setParameter("http.socket.timeout",
+                    SOCKET_TIMEOUT);
+
+            XslTransformer transformer = new XslTransformer(
+                    new ClasspathUriResolver());
+
+            Crawler crawler = createCrawler(aCrawlerConfig, client, transformer);
+            InputStream programConfigFile = new FileInputStream(new File(
+                    aProgramConfig));
+            ProgramConfigurationParser parser = new ProgramConfigurationParser();
+            parser.parse(programConfigFile);
+            List<ProgramFilter> programFilters = parser.getFilters();
+
+            try {
+                Page page = getStartPage(aStartUrl, crawler, aReport);
+                TVGuide guide = createGuide(page, aReport);
+                PrintVisitor printer = new PrintVisitor(System.out);
+                guide.accept(printer);
+                processResults(programFilters, guide, aNotifier, aReport);
+            } catch (PageException e) {
+                aReport.addMessage("Problem getting TV guide", e);
+                LOG.info("Problem getting TV guide", e);
+                throw e;
+            }
+            aNotifier.send(aReport.asXml());
+        } finally {
+            System.out.println("Crawler finished");
+        }
+    }
+
+    /**
+     * Records interesting shows.
+     * 
+     * @param aProgramCondition
+     *            Condition determining which shows are interesting.
+     * @param aGuide
+     *            Television guide.
+     * @throws MessagingException
+     *             In case of problems sending a summary mail.
+     */
+    private void processResults(List<ProgramFilter> aProgramCondition,
+            TVGuide aGuide, Notifier aNotifier, Report aReport) {
+        ProgramActionExecutor executor = new ProgramActionExecutor(aReport);
+        for (ProgramFilter filter : aProgramCondition) {
+            List<Program> programs = filter.apply(aGuide);
+            ProgramAction action = filter.getAction();
+            for (Program program : programs) {
+                action.execute(program, executor);
+            }
+        }
+        executor.commit();
+
+    }
+
+    /**
+     * Creates the crawler.
+     * 
+     * @param aCrawlerConfig
+     *            Crawler configuration file.
+     * @param aOs
+     *            Logging output stream for the crawler.
+     * @param aClient
+     *            HTTP Client to use.
+     * @return Crawler.
+     * @throws FileNotFoundException
+     *             In case configuration files cannot be found.
+     */
+    private Crawler createCrawler(String aCrawlerConfig, HttpClient aClient,
+            XslTransformer aTransformer) throws FileNotFoundException {
+        ConfigurationParser parser = new ConfigurationParser(aTransformer);
+        InputStream crawlerConfigFile = new FileInputStream(new File(
+                aCrawlerConfig));
+        Configuration config = parser.parse(crawlerConfigFile);
+        Crawler crawler = new CrawlerImpl(aClient, config);
+        return crawler;
+    }
+
+    /**
+     * Gets the start page of the electronic programme guide. This involves
+     * login and navigation to a suitable start page after logging in.
+     * 
+     * @param aStartUrl
+     *            URL of the electronic programme guide.
+     * @param aCrawler
+     *            Crawler to use.
+     * @param aReport
+     *            Report to use.
+     * @return Starting page.
+     */
+    private Page getStartPage(String aStartUrl, Crawler aCrawler, Report aReport)
+            throws PageException {
+        try {
+            Page page = aCrawler.getPage(aStartUrl, new NameValuePair[0]);
+            page = page.getAction("login").execute();
+            Action favorites = page.getAction("channels-favorites");
+            if (favorites == null) {
+                String msg = "Channels favorites action not found on start page";
+                throw new PageException(msg);
+            }
+            return favorites.execute();
+        } catch (PageException e) {
+            String msg = "Could not complete login to electronic programme guide.";
+            throw new PageException(msg, e);
+        }
+    }
+
+    /**
+     * Creates the TV guide by web crawling.
+     * 
+     * @param aPage
+     *            Starting page.
+     * @param aReport
+     *            Report to use.
+     * @return TV guide.
+     * @throws PageException
+     *             In case of problem getting the tv guide.
+     */
+    private TVGuide createGuide(Page aPage, Report aReport)
+            throws PageException {
+        LOG.info("Obtaining full TV guide");
+        Action[] actions = aPage.getActions();
+        if (actions.length == 0) {
+            LOG.error("No channels found");
+            throw new PageException("No channels found");
+        }
+        List<Channel> channels = new ArrayList<Channel>();
+        for (Action action : actions) {
+            try {
+                LOG.info("Getting channel info for '" + action.getName() + "'");
+                Action tomorrow = action.execute().getAction("tomorrow");
+                if (tomorrow == null) {
+                    throw new PageException("Channel summary page for '"
+                            + action.getName()
+                            + "' does not contain required information");
+                }
+                Channel channel = createChannel(action.getName(), tomorrow
+                        .execute(), aReport);
+                channels.add(channel);
+                if (SystemProperties.isDebugMode()) {
+                    break; // Only one channel is crawled.
+                }
+            } catch (PageException e) {
+                aReport.addMessage("Could not create channel information for '"
+                        + action.getName() + "'");
+                LOG.error("Could not create channel information for '"
+                        + action.getName() + "'", e);
+            }
+        }
+        return new TVGuide(channels);
+    }
+
+    /**
+     * Create channel information for a specific channel.
+     * 
+     * @param aChannel
+     *            Channel name.
+     * @param aPage
+     *            Starting page for the channel.
+     * @return Channel.
+     */
+    private Channel createChannel(String aChannel, Page aPage, Report aReport) {
+        LOG.info("Obtaining program for " + aChannel);
+        Action[] programActions = aPage.getActions();
+        List<Program> programs = new ArrayList<Program>();
+        for (Action action : programActions) {
+            String time = action.getContent().element("time").getText().trim();
+            Matcher matcher = _pattern.matcher(time);
+            if (matcher.matches()) {
+                Time begin = new Time(Integer.parseInt(matcher.group(1)),
+                        Integer.parseInt(matcher.group(2)));
+                Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
+                        .parseInt(matcher.group(4)));
+                TimeInterval interval = new TimeInterval(begin, end);
+                String description = "";
+                String keywords = "";
+
+                if (!SystemProperties.isNoProgramDetailsRequired()) {
+                    Element descriptionElem = action.getContent().element(
+                            "description");
+                    if (descriptionElem == null) {
+                        try {
+                            Page programInfo = action.execute();
+                            description = programInfo.getContent().element(
+                                    "description").getText().trim();
+                            keywords = programInfo.getContent().element(
+                                    "keywords").getText().trim();
+                        } catch (PageException e) {
+                            String msg = "Program details could not be determined for '"
+                                    + action.getName() + "'";
+                            aReport.addMessage(msg, e);
+                            LOG.warn(msg, e);
+                        }
+                    } else {
+                        description = descriptionElem.getTextTrim();
+                    }
+                }
+                Program program = new Program(aChannel, action.getName(),
+                        description, keywords, interval, action);
+
+                LOG.info("Got program " + program);
+                programs.add(program);
+            }
+        }
+        return new Channel(aChannel, programs);
+    }
+}