/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.wamblee.crawler.kiss;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.Transport;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
import org.wamblee.crawler.PageException;
import org.wamblee.crawler.impl.ConfigurationParser;
import org.wamblee.crawler.impl.CrawlerImpl;

/**
 * The KiSS crawler for automatic recording of interesting TV shows.
 *
 * All work is done by the constructor: it crawls the electronic programme
 * guide, prints the guide, applies the configured program filters, and mails
 * the resulting report.
 */
public class KissCrawler {

    private static final Log LOG = LogFactory.getLog(KissCrawler.class);

    /**
     * Log file name for the crawler.
     */
    private static final String LOG_FILE = "kiss.log";

    /**
     * Start URL of the electronic programme guide.
     */
    private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";

    /**
     * Crawler configuration file.
     */
    private static final String CRAWLER_CONFIG = "config.xml";

    /**
     * Configuration file describing interesting programs.
     */
    private static final String PROGRAM_CONFIG = "programs.xml";

    /**
     * Regular expression for matching time interval strings in the retrieved
     * pages. Groups 1 and 2 are the begin hour and minute, groups 3 and 4 the
     * end hour and minute; anything after the end time is ignored.
     */
    private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";

    /**
     * Compiled pattern for the time regular expression. Compiled once, since
     * the expression is a constant.
     */
    private static final Pattern TIME_PATTERN = Pattern.compile(TIME_REGEX);

    /**
     * SMTP host used for sending the summary mail.
     */
    private static final String MAIL_HOST = "falcon";

    /**
     * SMTP port used for sending the summary mail.
     */
    private static final String MAIL_PORT = "25";

    /**
     * Address used both as sender and as recipient of the summary mail.
     */
    private static final String MAIL_ADDRESS = "erik@brakkee.org";

    /**
     * Subject of the summary mail.
     */
    private static final String MAIL_SUBJECT = "KiSS crawler update";

    /**
     * Runs the KiSS crawler.
     *
     * @param aArgs
     *            Arguments, currently all ignored because they are hardcoded.
     * @throws Exception
     *             In case of problems.
     */
    public static void main(String[] aArgs) throws Exception {
        new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
    }

    /**
     * Constructs the crawler. This retrieves the TV guide by crawling the KiSS
     * EPG guide, filters the guide for interesting programs, tries to record
     * them, and sends a summary mail to the user.
     *
     * @param aStartUrl
     *            Start URL of the electronic programme guide.
     * @param aCrawlerConfig
     *            Configuration file for the crawler.
     * @param aProgramConfig
     *            Configuration file describing interesting shows.
     * @throws IOException
     *             In case of problems reading files.
     * @throws MessagingException
     *             In case of problems sending a mail notification.
     */
    public KissCrawler(String aStartUrl, String aCrawlerConfig,
            String aProgramConfig) throws IOException, MessagingException {
        PrintStream os = new PrintStream(new FileOutputStream(new File(
                LOG_FILE)));
        try {
            HttpClient client = new HttpClient();
            // To go through a local proxy, configure it on the client here,
            // e.g. client.getHostConfiguration().setProxy("127.0.0.1", 3128).

            Crawler crawler = createCrawler(aCrawlerConfig, os, client);

            List<ProgramFilter> programFilters;
            InputStream programConfigFile = new FileInputStream(new File(
                    aProgramConfig));
            try {
                programFilters = new ProgramConfigurationParser()
                        .parse(programConfigFile);
            } finally {
                // The configuration stream is only needed while parsing.
                closeQuietly(programConfigFile, aProgramConfig);
            }

            Page page = getStartPage(aStartUrl, crawler);
            TVGuide guide = createGuide(page);
            PrintVisitor printer = new PrintVisitor(System.out);
            guide.accept(printer);
            processResults(programFilters, guide);
        } finally {
            os.flush();
            os.close();
            System.out.println("Output written on '" + LOG_FILE + "'");
        }
    }

    /**
     * Records interesting shows. Every filter is applied to the guide and the
     * filter's action is executed for each matching program; afterwards the
     * executor is committed and its report is printed and mailed.
     *
     * @param aProgramCondition
     *            Filters determining which shows are interesting.
     * @param aGuide
     *            Television guide.
     * @throws MessagingException
     *             In case of problems sending a summary mail.
     */
    private void processResults(List<ProgramFilter> aProgramCondition,
            TVGuide aGuide) throws MessagingException {
        ProgramActionExecutor executor = new ProgramActionExecutor();
        for (ProgramFilter filter : aProgramCondition) {
            List<Program> programs = filter.apply(aGuide);
            ProgramAction action = filter.getAction();
            for (Program program : programs) {
                action.execute(program, executor);
            }
        }
        executor.commit();
        String msg = executor.getReport();
        System.out.println(msg);
        sendMail(msg);
    }

    /**
     * Creates the crawler.
     *
     * @param aCrawlerConfig
     *            Crawler configuration file.
     * @param aOs
     *            Logging output stream for the crawler.
     * @param aClient
     *            HTTP Client to use.
     * @return Crawler.
     * @throws FileNotFoundException
     *             In case configuration files cannot be found.
     */
    private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs,
            HttpClient aClient) throws FileNotFoundException {
        ConfigurationParser parser = new ConfigurationParser(aOs);
        InputStream crawlerConfigFile = new FileInputStream(new File(
                aCrawlerConfig));
        try {
            Configuration config = parser.parse(crawlerConfigFile);
            return new CrawlerImpl(aClient, config);
        } finally {
            // The configuration stream is only needed while parsing.
            closeQuietly(crawlerConfigFile, aCrawlerConfig);
        }
    }

    /**
     * Closes a stream, logging instead of propagating a failure so that an
     * exception from the preceding work is never masked by the close.
     *
     * @param aStream
     *            Stream to close.
     * @param aName
     *            Name of the underlying file, used in the log message.
     */
    private static void closeQuietly(InputStream aStream, String aName) {
        try {
            aStream.close();
        } catch (IOException e) {
            LOG.warn("Could not close '" + aName + "'", e);
        }
    }

    /**
     * Gets the start page of the electronic programme guide. This involves
     * login and navigation to a suitable start page after logging in.
     *
     * @param aStartUrl
     *            URL of the electronic programme guide.
     * @param aCrawler
     *            Crawler to use.
     * @return Starting page.
     * @throws RuntimeException
     *             If the start page or the favorites page cannot be retrieved.
     */
    private Page getStartPage(String aStartUrl, Crawler aCrawler) {
        try {
            Page page = aCrawler.getPage(aStartUrl);
            return page.getAction("channels-favorites").execute();
        } catch (PageException e) {
            throw new RuntimeException(
                    "Could not login to electronic program guide", e);
        }
    }

    /**
     * Creates the TV guide by web crawling. Each action on the starting page
     * is treated as one channel. A channel whose pages cannot be retrieved is
     * logged and left out of the guide.
     *
     * @param aPage
     *            Starting page.
     * @return TV guide.
     */
    private TVGuide createGuide(Page aPage) {
        LOG.info("Obtaining full TV guide");
        Action[] actions = aPage.getActions();
        List<Channel> channels = new ArrayList<Channel>();
        for (Action action : actions) {
            try {
                LOG.info("Getting channel info for '" + action.getName()
                        + "'");
                Channel channel = createChannel(action.getName(), action
                        .execute().getAction("right-now").execute());
                channels.add(channel);
                if (SystemProperties.isDebugMode()) {
                    break; // Only one channel is crawled.
                }
            } catch (PageException e) {
                LOG.error("Could not create channel information for '"
                        + action.getName() + "'", e);
            }
        }
        return new TVGuide(channels);
    }

    /**
     * Create channel information for a specific channel. Actions whose time
     * text does not match the time interval pattern are skipped.
     *
     * @param aChannel
     *            Channel name.
     * @param aPage
     *            Starting page for the channel.
     * @return Channel.
     */
    private Channel createChannel(String aChannel, Page aPage) {
        LOG.info("Obtaining program for " + aChannel);
        Action[] programActions = aPage.getActions();
        List<Program> programs = new ArrayList<Program>();
        for (Action action : programActions) {
            String time = action.getContent().element("time").getText()
                    .trim();
            Matcher matcher = TIME_PATTERN.matcher(time);
            if (matcher.matches()) {
                Time begin = new Time(Integer.parseInt(matcher.group(1)),
                        Integer.parseInt(matcher.group(2)));
                Time end = new Time(Integer.parseInt(matcher.group(3)),
                        Integer.parseInt(matcher.group(4)));
                TimeInterval interval = new TimeInterval(begin, end);

                // Details stay empty when they are not required or cannot
                // be retrieved; the program itself is still recorded.
                String description = "";
                String keywords = "";
                if (!SystemProperties.isNoProgramDetailsRequired()) {
                    try {
                        Page programInfo = action.execute();
                        description = programInfo.getContent().element(
                                "description").getText().trim();
                        keywords = programInfo.getContent().element(
                                "keywords").getText().trim();
                    } catch (PageException e) {
                        LOG.warn(
                                "Program details could not be determined for '"
                                        + action.getName() + "'", e);
                    }
                }
                Program program = new Program(aChannel, action.getName(),
                        description, keywords, interval, action);

                LOG.info("Got program " + program);
                programs.add(program);
            }
        }
        return new Channel(aChannel, programs);
    }

    /**
     * Sends a summary mail to the user.
     *
     * @param aText
     *            Text of the mail.
     * @throws MessagingException
     *             In case of problems sending mail.
     */
    private void sendMail(String aText) throws MessagingException {
        Properties props = new Properties();
        props.put("mail.transport.protocol", "smtp");
        props.put("mail.smtp.host", MAIL_HOST);
        props.put("mail.smtp.port", MAIL_PORT);

        Session mailSession = Session.getInstance(props);
        Message message = new MimeMessage(mailSession);

        message.setFrom(new InternetAddress(MAIL_ADDRESS));
        message.setRecipient(Message.RecipientType.TO, new InternetAddress(
                MAIL_ADDRESS));
        message.setSentDate(new Date());
        message.setSubject(MAIL_SUBJECT);
        message.setText(aText);
        Transport.send(message);
    }
}