/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.wamblee.crawler.kiss;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.Transport;
import javax.mail.internet.AddressException;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
import org.wamblee.conditions.Condition;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
import org.wamblee.crawler.PageException;
import org.wamblee.crawler.impl.ConfigurationParser;
import org.wamblee.crawler.impl.CrawlerImpl;

/**
 * Command-line crawler for the KiSS electronic program guide.
 *
 * Constructing an instance performs the complete run: it crawls the guide
 * starting at the given URL, prints the guide to standard output, asks every
 * program that matches the configured condition to record itself, and mails a
 * summary of the outcome.
 */
public class KissCrawler {

    private static final Log LOG = LogFactory.getLog(KissCrawler.class);

    /** File that receives the output stream handed to the configuration parser. */
    private static final String LOG_FILE = "kiss.log";

    private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";

    private static final String CRAWLER_CONFIG = "config.xml";

    private static final String PROGRAM_CONFIG = "programs.xml";

    /**
     * Two "hh:mm" times separated by non-digit characters, followed by
     * arbitrary text. Groups 1-2 hold the first time, groups 3-4 the second.
     */
    private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";

    /** Compiled once; Pattern instances are immutable and safe to share. */
    private static final Pattern TIME_PATTERN = Pattern.compile(TIME_REGEX);

    private static final String MAIL_HOST = "falcon";

    private static final String MAIL_PORT = "25";

    private static final String MAIL_ADDRESS = "erik@brakkee.org";

    /**
     * Runs the crawler once: crawl, print, record matching programs and mail
     * the summary.
     *
     * @param aStartUrl
     *            URL of the first page to retrieve.
     * @param aCrawlerConfig
     *            Path of the crawler configuration file.
     * @param aProgramConfig
     *            Path of the file describing which programs are interesting.
     * @throws IOException
     *             If the log file or the program configuration cannot be
     *             opened.
     * @throws AddressException
     *             If a mail address used for the summary is invalid.
     * @throws MessagingException
     *             If the summary mail cannot be sent.
     */
    public KissCrawler(String aStartUrl, String aCrawlerConfig,
            String aProgramConfig) throws IOException, AddressException,
            MessagingException {
        FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
        PrintStream os = new PrintStream(fos);

        try {
            HttpClient client = new HttpClient();
            // client.getHostConfiguration().setProxy("localhost", 3128);

            Crawler crawler = createCrawler(aCrawlerConfig, os, client);
            Page page = getStartPage(aStartUrl, crawler);
            TVGuide guide = createGuide(page);

            PrintVisitor printer = new PrintVisitor(System.out);
            guide.accept(printer);

            Condition programCondition;
            InputStream programConfigFile = new FileInputStream(new File(
                    aProgramConfig));
            try {
                // NOTE(review): assumes parse() consumes the stream before
                // returning, so it can be closed right away - confirm.
                programCondition = new ProgramConfigurationParser()
                        .parse(programConfigFile);
            } finally {
                closeQuietly(programConfigFile);
            }
            recordInterestingShows(programCondition, guide);
        } finally {
            os.flush();
            os.close();
            System.out.println("Output written on '" + LOG_FILE + "'");
        }
    }

    /**
     * Asks every program in the guide that matches the condition to record
     * itself, then prints and mails a summary listing the recorded programs,
     * the programs that were not recorded and the attempts that failed.
     *
     * @param programCondition
     *            Condition selecting the programs of interest.
     * @param guide
     *            Guide to search for matching programs.
     * @throws AddressException
     *             If a mail address used for the summary is invalid.
     * @throws MessagingException
     *             If the summary mail cannot be sent.
     */
    private void recordInterestingShows(Condition programCondition,
            TVGuide guide) throws AddressException, MessagingException {
        MatchVisitor matcher = new MatchVisitor(programCondition);
        guide.accept(matcher);
        List<Program> programs = matcher.getMatches();

        StringBuilder recorded = new StringBuilder();
        StringBuilder notRecorded = new StringBuilder();
        StringBuilder failures = new StringBuilder();
        for (Program program : programs) {
            try {
                if (program.record()) {
                    recorded.append("\n").append(program);
                } else {
                    notRecorded.append("\n").append(program);
                }
            } catch (PageException e) {
                // Keep going: one failed attempt must not stop the others.
                LOG.warn("Attempt to record " + program + " failed.", e);
                failures.append("\n").append(program.toString()).append(": ")
                        .append(e.getMessage());
            }
        }

        StringBuilder msg = new StringBuilder("Summary of KiSS crawler: \n\n\n");
        if (recorded.length() > 0) {
            msg.append("Recorded programs:\n\n").append(recorded)
                    .append("\n\n");
        }
        if (notRecorded.length() > 0) {
            msg.append("Not recorded programs:\n\n").append(notRecorded)
                    .append("\n\n");
        }
        if (recorded.length() == 0 && notRecorded.length() == 0) {
            // The trailing blank line keeps a following "Failures:" section
            // from running onto the same line as this sentence.
            msg.append("No suitable programs found\n\n");
        }
        if (failures.length() > 0) {
            msg.append("Failures:\n\n").append(failures);
        }

        String summary = msg.toString();
        System.out.println(summary);
        sendMail(summary);
    }

    /**
     * Builds a crawler from the given configuration file.
     *
     * @param aCrawlerConfig
     *            Path of the crawler configuration file.
     * @param os
     *            Stream passed to the configuration parser.
     * @param client
     *            HTTP client the crawler uses.
     * @return The configured crawler.
     * @throws FileNotFoundException
     *             If the configuration file cannot be opened.
     */
    private Crawler createCrawler(String aCrawlerConfig, PrintStream os,
            HttpClient client) throws FileNotFoundException {
        ConfigurationParser parser = new ConfigurationParser(os);
        InputStream crawlerConfigFile = new FileInputStream(new File(
                aCrawlerConfig));
        try {
            // NOTE(review): assumes parse() consumes the stream before
            // returning, so it can be closed right away - confirm.
            Configuration config = parser.parse(crawlerConfigFile);
            return new CrawlerImpl(client, config);
        } finally {
            closeQuietly(crawlerConfigFile);
        }
    }

    /**
     * Closes a stream that was only read from. A failure to close is logged
     * instead of propagated so that it cannot mask an exception that is
     * already in flight.
     *
     * @param aStream
     *            Stream to close; ignored when null.
     */
    private static void closeQuietly(InputStream aStream) {
        if (aStream == null) {
            return;
        }
        try {
            aStream.close();
        } catch (IOException e) {
            LOG.warn("Could not close input stream", e);
        }
    }

    /**
     * Retrieves the page at the start URL and follows its
     * "channels-favorites" action.
     *
     * @param aStartUrl
     *            URL of the first page to retrieve.
     * @param crawler
     *            Crawler used to retrieve the page.
     * @return The page obtained by executing the "channels-favorites" action.
     * @throws RuntimeException
     *             If retrieving the page or executing the action fails; the
     *             original {@link PageException} is kept as the cause.
     */
    private Page getStartPage(String aStartUrl, Crawler crawler) {
        try {
            Page page = crawler.getPage(aStartUrl);
            return page.getAction("channels-favorites").execute();
        } catch (PageException e) {
            throw new RuntimeException(
                    "Could not login to electronic program guide", e);
        }
    }

    /**
     * Runs the crawler with the built-in start URL and configuration file
     * names.
     *
     * @param args
     *            Ignored.
     * @throws Exception
     *             If the run fails.
     */
    public static void main(String[] args) throws Exception {
        new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
    }

    /**
     * Prints the names of all actions of a page and the page content as XML
     * to standard output. Not called from within this class.
     *
     * @param aPage
     *            Page to print.
     */
    private void showPage(Page aPage) {
        Action[] links = aPage.getActions();
        for (Action link : links) {
            System.out.println("Link found '" + link.getName() + "'");
        }
        Element element = aPage.getContent();
        System.out.println("Retrieved content: " + element.asXML());
    }

    /**
     * Builds the TV guide from a page whose actions each stand for a channel.
     * For every action, the action itself and then the "right-now" action of
     * the resulting page are executed to obtain the channel's program page.
     * A channel whose pages cannot be retrieved is logged and left out.
     *
     * @param page
     *            Page listing the channels.
     * @return Guide containing the channels that could be retrieved.
     */
    private TVGuide createGuide(Page page) {
        LOG.info("Obtaining full TV guide");
        Action[] actions = page.getActions();
        List<Channel> channels = new ArrayList<Channel>();
        for (Action action : actions) {
            try {
                LOG.info("Getting channel info for '" + action.getName()
                        + "'");
                Page programPage = action.execute().getAction("right-now")
                        .execute();
                channels.add(createChannel(action.getName(), programPage));
            } catch (PageException e) {
                LOG.error("Could not create channel information for '"
                        + action.getName() + "'", e);
            }
        }
        return new TVGuide(channels);
    }

    /**
     * Builds a channel from a page whose actions each stand for a program.
     * The text of the "time" element of every action is matched against
     * {@link #TIME_REGEX}; actions whose time text does not match are
     * skipped.
     *
     * @param aChannel
     *            Name of the channel.
     * @param aPage
     *            Page listing the programs of the channel.
     * @return Channel with the programs whose time could be parsed.
     */
    private Channel createChannel(String aChannel, Page aPage) {
        LOG.info("Obtaining program for " + aChannel);
        Action[] programActions = aPage.getActions();
        List<Program> programs = new ArrayList<Program>();
        for (Action action : programActions) {
            String time = action.getContent().element("time").getText().trim();
            Matcher matcher = TIME_PATTERN.matcher(time);
            if (!matcher.matches()) {
                continue;
            }
            Time begin = new Time(Integer.parseInt(matcher.group(1)),
                    Integer.parseInt(matcher.group(2)));
            Time end = new Time(Integer.parseInt(matcher.group(3)),
                    Integer.parseInt(matcher.group(4)));
            TimeInterval interval = new TimeInterval(begin, end);

            // Fetching the detail page of each program is disabled, so the
            // description and keywords are left empty:
            // Page programInfo = action.execute();
            // String description =
            // programInfo.getContent().element("description").getText().trim();
            // String keywords =
            // programInfo.getContent().element("keywords").getText().trim();
            String description = "";
            String keywords = "";

            Program program = new Program(aChannel, action.getName(),
                    description, keywords, interval, action);
            LOG.debug("Got program " + program);
            programs.add(program);
        }
        return new Channel(aChannel, programs);
    }

    /**
     * Sends the given text as a plain-text mail with the subject
     * "KiSS crawler update" through the hard-coded SMTP host, using the same
     * hard-coded address as sender and recipient.
     *
     * @param aText
     *            Body of the mail.
     * @throws AddressException
     *             If the sender or recipient address is invalid.
     * @throws MessagingException
     *             If the mail cannot be composed or sent.
     */
    private void sendMail(String aText) throws AddressException,
            MessagingException {
        Properties props = new Properties();
        props.put("mail.transport.protocol", "smtp");
        props.put("mail.smtp.host", MAIL_HOST);
        props.put("mail.smtp.port", MAIL_PORT);

        Session mailSession = Session.getInstance(props);
        Message message = new MimeMessage(mailSession);

        message.setFrom(new InternetAddress(MAIL_ADDRESS));
        message.setRecipient(Message.RecipientType.TO, new InternetAddress(
                MAIL_ADDRESS));
        message.setSentDate(new Date());
        message.setSubject("KiSS crawler update");
        message.setText(aText);
        Transport.send(message);
    }
}