--- /dev/null
+<?xml version="1.0"?>
+
+<!DOCTYPE project [
+ <!ENTITY header SYSTEM "file:../../build/header.xml">
+ <!ENTITY trailer SYSTEM "file:../../build/trailer.xml">
+]>
+
+<!-- Ant build file for the module named by the module.name property below.
+     The header and trailer entities declared in the DOCTYPE are expanded
+     inside this project element. -->
+<!-- NOTE(review): the project name is "crawler" while module.name is
+     "crawler-kiss"; confirm that the difference is intended. -->
+<project name="crawler" default="jar" basedir=".">
+
+
+ <!-- =============================================================================== -->
+ <!-- Module properties, followed by the inclusion of the build header.              -->
+ <!-- project.home points two directories up from this build file.                   -->
+ <!-- =============================================================================== -->
+ <property name="project.home" value="../.."/>
+ <property name="module.name" value="crawler-kiss" />
+
+ &header;
+
+ <!-- Dependencies needed to build this module. The targets listed in "depends"
+      are not defined in this file; presumably they come from the included
+      header or trailer (TODO confirm). -->
+ <target name="module.build.deps"
+ depends="logging.d,dom4j.d,xerces.d,httpclient.d,jtidy.d,wamblee.support.d,wamblee.crawler.d">
+ </target>
+
+ <!-- Additional libraries needed only for testing. A library that is
+      already part of module.build.path must not be listed here again. -->
+ <target name="module.test.deps" depends="wamblee.support.test.d,wamblee.crawler.test.d">
+ </target>
+
+ &trailer;
+
+
+</project>
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import java.util.List;
+
+/**
+ * Skeleton {@link Visitor} that implements the traversal of the TV guide
+ * structure: visiting a guide forwards the visitor to each of its channels,
+ * and visiting a channel forwards it to each of its programs. Subclasses
+ * supply <code>visitProgram</code> and may override the traversal methods.
+ */
+public abstract class AbstractVisitor implements Visitor {
+
+    /**
+     * Constructs the visitor.
+     */
+    protected AbstractVisitor() {
+        // Nothing to set up.
+    }
+
+    /**
+     * Lets every program of the channel accept this visitor, in list order.
+     *
+     * @param aChannel Channel whose programs are traversed.
+     * @see org.wamblee.crawler.kiss.Visitor#visitChannel(org.wamblee.crawler.kiss.Channel)
+     */
+    public void visitChannel(Channel aChannel) {
+        for (Program current : aChannel.getPrograms()) {
+            current.accept(this);
+        }
+    }
+
+    /**
+     * Lets every channel of the guide accept this visitor, in list order.
+     *
+     * @param aGuide Guide whose channels are traversed.
+     * @see org.wamblee.crawler.kiss.Visitor#visitTvGuide(org.wamblee.crawler.kiss.TVGuide)
+     */
+    public void visitTvGuide(TVGuide aGuide) {
+        for (Channel current : aGuide.getChannels()) {
+            current.accept(this);
+        }
+    }
+
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * A TV channel: a name together with the list of programs on that channel.
+ */
+public class Channel {
+
+    /** Name of the channel. */
+    private final String _name;
+
+    /** Programs of the channel; the list handed to the constructor is stored as is. */
+    private final List<Program> _programs;
+
+    /**
+     * Constructs the channel.
+     *
+     * @param aName Channel name.
+     * @param aPrograms Programs of the channel. The list is not copied.
+     */
+    public Channel(String aName, List<Program> aPrograms) {
+        _programs = aPrograms;
+        _name = aName;
+    }
+
+    /**
+     * @return Name of the channel.
+     */
+    public String getName() {
+        return _name;
+    }
+
+    /**
+     * @return Unmodifiable view of the programs of this channel.
+     */
+    public List<Program> getPrograms() {
+        List<Program> readOnlyView = Collections.unmodifiableList(_programs);
+        return readOnlyView;
+    }
+
+    /**
+     * Visitor entry point: calls back <code>visitChannel</code> on the visitor.
+     *
+     * @param aVisitor Visitor to call back.
+     */
+    public void accept(Visitor aVisitor) {
+        aVisitor.visitChannel(this);
+    }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Element;
+import org.wamblee.conditions.Condition;
+import org.wamblee.conditions.OrCondition;
+import org.wamblee.crawler.Action;
+import org.wamblee.crawler.Configuration;
+import org.wamblee.crawler.Crawler;
+import org.wamblee.crawler.Page;
+import org.wamblee.crawler.impl.ConfigurationParser;
+import org.wamblee.crawler.impl.CrawlerImpl;
+
+/**
+ * Command-line entry point that crawls the KiSS electronic program guide.
+ * It reads the crawler and program configuration, retrieves the favorite
+ * channels with their programs, prints the resulting guide, and attempts to
+ * record every program that matches the configured condition.
+ */
+public class KissCrawler {
+
+    private static final Log LOG = LogFactory.getLog(KissCrawler.class);
+
+    /** File backing the print stream that is handed to the configuration parser. */
+    private static final String LOG_FILE = "kiss.log";
+
+    /** Default start page of the crawl, used by {@link #main(String[])}. */
+    private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
+
+    /** Default crawler configuration file, used by {@link #main(String[])}. */
+    private static final String CRAWLER_CONFIG = "config.xml";
+
+    /** Default program configuration file, used by {@link #main(String[])}. */
+    private static final String PROGRAM_CONFIG = "programs.xml";
+
+    /**
+     * Matches a begin and an end time written as two-digit hours and minutes.
+     * Groups 1 and 2 are the begin hour and minute, groups 3 and 4 the end
+     * hour and minute.
+     */
+    private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
+
+    /** Compiled form of {@link #TIME_REGEX}. */
+    private final Pattern _pattern;
+
+    /**
+     * Runs the complete crawl. All work is done in the constructor.
+     *
+     * @param aStartUrl URL of the page where crawling starts.
+     * @param aCrawlerConfig Path of the crawler configuration file.
+     * @param aProgramConfig Path of the file describing the desired programs.
+     * @throws Exception In case a configuration file cannot be read or the
+     *             crawl fails.
+     */
+    public KissCrawler(String aStartUrl, String aCrawlerConfig, String aProgramConfig) throws Exception {
+
+        _pattern = Pattern.compile(TIME_REGEX);
+
+        FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
+        PrintStream os = new PrintStream(fos);
+
+        try {
+            Configuration config = parseCrawlerConfig(aCrawlerConfig, os);
+            Condition<Program> programCondition = parseProgramConfig(aProgramConfig);
+
+            HttpClient client = new HttpClient();
+            // client.getHostConfiguration().setProxy("localhost", 3128);
+
+            Crawler crawler = new CrawlerImpl(client, config);
+
+            Page page = crawler.getPage(aStartUrl);
+            showPage(page);
+            page = page.getAction("channels-favorites").execute();
+            TVGuide guide = createGuide(page);
+            PrintVisitor printer = new PrintVisitor(System.out);
+            guide.accept(printer);
+
+            MatchVisitor matcher = new MatchVisitor(programCondition);
+            guide.accept(matcher);
+            List<Program> programs = matcher.getMatches();
+            for (Program program : programs) {
+                // record() is not a pure query: it executes the record action
+                // of the program and reports whether that action was available.
+                System.out.println("Found: " + program + " record: " + program.record());
+            }
+
+        } finally {
+            os.flush();
+            os.close();
+            System.out.println("Output written on '" + LOG_FILE + "'");
+        }
+    }
+
+    /**
+     * Starts a crawl with the default URL and configuration files.
+     *
+     * @param args Ignored.
+     * @throws Exception In case the crawl fails.
+     */
+    public static void main(String[] args) throws Exception {
+        new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
+    }
+
+    /**
+     * Parses the crawler configuration and closes the configuration file
+     * afterwards, also when parsing fails.
+     *
+     * @param aCrawlerConfig Path of the crawler configuration file.
+     * @param aOutput Stream handed to the configuration parser.
+     * @return Parsed crawler configuration.
+     * @throws Exception In case the file cannot be opened, parsed, or closed.
+     */
+    private static Configuration parseCrawlerConfig(String aCrawlerConfig, PrintStream aOutput) throws Exception {
+        ConfigurationParser parser = new ConfigurationParser(aOutput);
+        InputStream crawlerConfigFile = new FileInputStream(new File(aCrawlerConfig));
+        try {
+            return parser.parse(crawlerConfigFile);
+        } finally {
+            crawlerConfigFile.close();
+        }
+    }
+
+    /**
+     * Parses the condition describing the desired programs and closes the
+     * configuration file afterwards, also when parsing fails.
+     *
+     * @param aProgramConfig Path of the program configuration file.
+     * @return Condition matching the desired programs.
+     * @throws Exception In case the file cannot be opened, parsed, or closed.
+     */
+    private static Condition<Program> parseProgramConfig(String aProgramConfig) throws Exception {
+        InputStream programConfigFile = new FileInputStream(new File(aProgramConfig));
+        try {
+            return new ProgramConfigurationParser().parse(programConfigFile);
+        } finally {
+            programConfigFile.close();
+        }
+    }
+
+    /**
+     * Prints the names of the actions found on a page and the page content
+     * as XML to standard output.
+     *
+     * @param aPage Page to show.
+     */
+    private void showPage(Page aPage) {
+        Action[] links = aPage.getActions();
+        for (Action link : links) {
+            System.out.println("Link found '" + link.getName() + "'");
+        }
+        Element element = aPage.getContent();
+        System.out.println("Retrieved content: " + element.asXML());
+    }
+
+    /**
+     * Builds the TV guide. Every action on the given page is treated as a
+     * channel: the action is executed, the "right-now" action of the resulting
+     * page is executed as well, and the programs are read from that page.
+     *
+     * @param page Page whose actions are the channels.
+     * @return Guide with one channel per action of the page.
+     */
+    private TVGuide createGuide(Page page) {
+        LOG.info("Obtaining full TV guide");
+        Action[] actions = page.getActions();
+        List<Channel> channels = new ArrayList<Channel>();
+        for (Action action : actions) {
+            Channel channel = createChannel(action.getName(), action.execute()
+                    .getAction("right-now").execute());
+            channels.add(channel);
+        }
+        return new TVGuide(channels);
+    }
+
+    /**
+     * Builds a channel from a page on which every action is a program.
+     * Actions whose "time" element does not match {@link #TIME_REGEX} are
+     * skipped.
+     *
+     * @param aChannel Name of the channel.
+     * @param aPage Page whose actions are the programs of the channel.
+     * @return Channel with the programs that have a recognizable time.
+     */
+    private Channel createChannel(String aChannel, Page aPage) {
+        LOG.info("Obtaining program for " + aChannel);
+        Action[] programActions = aPage.getActions();
+        List<Program> programs = new ArrayList<Program>();
+        for (Action action : programActions) {
+            String time = action.getContent().element("time").getText().trim();
+            Matcher matcher = _pattern.matcher(time);
+            if (matcher.matches()) {
+                Time begin = new Time(Integer.parseInt(matcher.group(1)),
+                        Integer.parseInt(matcher.group(2)));
+                Time end = new Time(Integer.parseInt(matcher.group(3)),
+                        Integer.parseInt(matcher.group(4)));
+                TimeInterval interval = new TimeInterval(begin, end);
+                // Retrieval of description and keywords through the program
+                // page is disabled; empty strings are used instead.
+                //Page programInfo = action.execute();
+                //String description = programInfo.getContent().element("description").getText().trim();
+                //String keywords = programInfo.getContent().element("keywords").getText().trim();
+                String description = "";
+                String keywords = "";
+                Program program = new Program(aChannel, action.getName(), description, keywords, interval, action);
+
+                LOG.debug("Got program " + program);
+                programs.add(program);
+            }
+        }
+        return new Channel(aChannel, programs);
+    }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.wamblee.conditions.Condition;
+
+/**
+ * Visitor that collects every visited program accepted by a condition.
+ * The traversal of guides and channels is inherited from
+ * {@link AbstractVisitor}.
+ */
+public class MatchVisitor extends AbstractVisitor {
+
+    /** Condition that decides whether a program is collected. */
+    private final Condition<? super Program> _matcher;
+
+    /** Programs accepted so far, in visiting order. */
+    private final List<Program> _programs;
+
+    /**
+     * Constructs the visitor.
+     *
+     * @param aMatcher Condition a program must satisfy to be collected. The
+     *            parameter is typed instead of raw so that the call to
+     *            <code>matches</code> is checked by the compiler.
+     */
+    public MatchVisitor(Condition<? super Program> aMatcher) {
+        _matcher = aMatcher;
+        _programs = new ArrayList<Program>();
+    }
+
+    /**
+     * Adds the program to the matches when the condition accepts it.
+     *
+     * @param aProgram Program to check.
+     * @see org.wamblee.crawler.kiss.Visitor#visitProgram(org.wamblee.crawler.kiss.Program)
+     */
+    public void visitProgram(Program aProgram) {
+        if (_matcher.matches(aProgram)) {
+            _programs.add(aProgram);
+        }
+    }
+
+    /**
+     * @return The programs collected so far. This is the internal list, not
+     *         a copy.
+     */
+    public List<Program> getMatches() {
+        return _programs;
+    }
+
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import java.io.PrintStream;
+
+/**
+ * Visitor that prints channel names and programs to a print stream.
+ */
+public class PrintVisitor extends AbstractVisitor {
+
+    /** Stream that receives all output of this visitor. */
+    private final PrintStream _stream;
+
+    /**
+     * Constructs the visitor.
+     *
+     * @param aStream Stream to print to.
+     */
+    public PrintVisitor(PrintStream aStream) {
+        _stream = aStream;
+    }
+
+    /**
+     * Prints the program, preceded by a space, on the stream.
+     *
+     * @param aProgram Program to print.
+     * @see org.wamblee.crawler.kiss.Visitor#visitProgram(org.wamblee.crawler.kiss.Program)
+     */
+    public void visitProgram(Program aProgram) {
+        _stream.println(" " + aProgram.toString());
+    }
+
+    /**
+     * Prints the channel name and then lets the superclass visit the
+     * programs of the channel.
+     *
+     * @param aChannel Channel to print.
+     * @see org.wamblee.crawler.kiss.AbstractVisitor#visitChannel(org.wamblee.crawler.kiss.Channel)
+     */
+    @Override
+    public void visitChannel(Channel aChannel) {
+        // Write to the configured stream, not System.out, so that channel
+        // names and programs end up in the same output.
+        _stream.println(aChannel.getName());
+        super.visitChannel(aChannel);
+    }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import org.wamblee.crawler.Action;
+
+/**
+ * A TV program on a channel, together with the crawler action that leads to
+ * the program's own page.
+ */
+public class Program {
+
+    /** Name of the action that is looked up on the program page by {@link #record()}. */
+    private static final String RECORD_ACTION = "record";
+
+    /** Prefix put before every line of the description in {@link #toString()}. */
+    private static final String INDENT = " ";
+
+    private final String _channel;
+    private final String _name;
+    private final String _description;
+    private final String _keywords;
+    private final TimeInterval _interval;
+    private final Action _programInfo;
+
+    /**
+     * Constructs the program.
+     *
+     * @param aChannel Name of the channel.
+     * @param aName Name of the program.
+     * @param aDescription Description of the program.
+     * @param aKeywords Keywords of the program.
+     * @param aInterval Time interval of the program.
+     * @param aProgramInfo Action that is executed by {@link #record()} to
+     *            obtain the page holding the record action.
+     */
+    public Program(String aChannel, String aName, String aDescription, String aKeywords, TimeInterval aInterval, Action aProgramInfo) {
+        _programInfo = aProgramInfo;
+        _interval = aInterval;
+        _keywords = aKeywords;
+        _description = aDescription;
+        _name = aName;
+        _channel = aChannel;
+    }
+
+    /**
+     * @return Name of the channel.
+     */
+    public String getChannel() {
+        return _channel;
+    }
+
+    /**
+     * @return Name of the program.
+     */
+    public String getName() {
+        return _name;
+    }
+
+    /**
+     * @return Description of the program.
+     */
+    public String getDescription() {
+        return _description;
+    }
+
+    /**
+     * @return Keywords of the program.
+     */
+    public String getKeywords() {
+        return _keywords;
+    }
+
+    /**
+     * @return Time interval of the program.
+     */
+    public TimeInterval getInterval() {
+        return _interval;
+    }
+
+    /**
+     * Executes the program info action and, when the resulting page offers a
+     * "record" action, executes that as well.
+     *
+     * @return True when a record action was found and executed, false when
+     *         the page has no record action.
+     */
+    public boolean record() {
+        Action recordAction = _programInfo.execute().getAction(RECORD_ACTION);
+        if (recordAction != null) {
+            recordAction.execute();
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Visitor entry point: calls back <code>visitProgram</code> on the visitor.
+     *
+     * @param aVisitor Visitor to call back.
+     */
+    public void accept(Visitor aVisitor) {
+        aVisitor.visitProgram(this);
+    }
+
+    /**
+     * Formats the program as a summary line followed by the description,
+     * every line of which is prefixed with the indent.
+     *
+     * @see java.lang.Object#toString()
+     */
+    @Override
+    public String toString() {
+        String summary = _interval + " - " + _name + " (" + _channel + "/" + _keywords + ")";
+        String indentedDescription = (INDENT + _description).replace("\n", "\n" + INDENT);
+        return summary + "\n" + indentedDescription;
+    }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Element;
+import org.dom4j.io.SAXReader;
+import org.wamblee.conditions.Condition;
+import org.wamblee.conditions.OrCondition;
+
+/**
+ * Parse the configuration of desired programs.
+ */
+public class ProgramConfigurationParser {
+
+
+ private static final String ELEM_PROGRAM = "program";
+ private static final String ELEM_PATTERN = "name";
+
+ /**
+ * Parses the condition used to match the desired programs.
+ *
+ * @param aStream
+ * Input stream to parse from.
+ * @return Condition.
+ */
+ Condition<Program> parse(InputStream aStream) {
+ try {
+ SAXReader reader = new SAXReader();
+ Document document = reader.read(aStream);
+
+ Element root = document.getRootElement();
+ List<Condition<Program>> conditions = new ArrayList<Condition<Program>>();
+
+ for (Iterator i = root.elementIterator(ELEM_PROGRAM); i.hasNext(); ) {
+ Element program = (Element)i.next();
+ String pattern = ".*" + program.element(ELEM_PATTERN).getText() + ".*";
+ conditions.add(new ProgramNameMatcher(pattern));
+ }
+ return new OrCondition<Program>(conditions);
+ } catch (DocumentException e) {
+ throw new RuntimeException("Error parsing program configuraiton", e);
+ }
+ }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.wamblee.conditions.Condition;
+
+/**
+ * Condition that accepts a program when its name, converted to lower case,
+ * matches a regular expression in its entirety.
+ */
+public class ProgramNameMatcher implements Condition<Program> {
+
+    /** Compiled regular expression the lower-case program name is matched against. */
+    private final Pattern _pattern;
+
+    /**
+     * Constructs the matcher.
+     *
+     * @param aPattern Regular expression; it is compiled as given.
+     */
+    public ProgramNameMatcher(String aPattern) {
+        _pattern = Pattern.compile(aPattern);
+    }
+
+    /**
+     * Checks the name of a program against the pattern.
+     *
+     * @param aProgram Program to check.
+     * @return True iff the complete lower-case name matches the pattern.
+     */
+    public boolean matches(Program aProgram) {
+        String lowerCaseName = aProgram.getName().toLowerCase();
+        return _pattern.matcher(lowerCaseName).matches();
+    }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * The TV guide: the list of channels that were retrieved.
+ */
+public class TVGuide {
+
+    /** Channels of the guide; the list handed to the constructor is stored as is. */
+    private final List<Channel> _channels;
+
+    /**
+     * Constructs the guide.
+     *
+     * @param aChannels Channels of the guide. The list is not copied.
+     */
+    public TVGuide(List<Channel> aChannels) {
+        _channels = aChannels;
+    }
+
+    /**
+     * @return Unmodifiable view of the channels of this guide.
+     */
+    public List<Channel> getChannels() {
+        List<Channel> readOnlyView = Collections.unmodifiableList(_channels);
+        return readOnlyView;
+    }
+
+    /**
+     * Visitor entry point: calls back <code>visitTvGuide</code> on the visitor.
+     *
+     * @param aVisitor Visitor to call back.
+     */
+    public void accept(Visitor aVisitor) {
+        aVisitor.visitTvGuide(this);
+    }
+
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+
+/**
+ *
+ */
+public class Time {
+
+ private int _hour;
+ private int _minute;
+
+ public Time(int aHour, int aMinute) {
+ _hour = aHour;
+ _minute = aMinute;
+ }
+
+ public int getHour() {
+ return _hour;
+ }
+
+ public int getMinute() {
+ return _minute;
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ NumberFormat format = new DecimalFormat("00");
+ return format.format(_hour) + ":" + format.format(_minute);
+ }
+
+ float asFloat() {
+ return (float)_hour + (float)_minute/(float)60.0;
+ }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+/**
+ *
+ */
+public class TimeInterval {
+
+ private Time _begin;
+ private Time _end;
+
+ public TimeInterval(Time aBegin, Time aEnd) {
+ _begin = aBegin;
+ _end = aEnd;
+ }
+
+ public Time getBegin() {
+ return _begin;
+ }
+
+ public Time getEnd() {
+ return _end;
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ return _begin + " - " + _end;
+ }
+
+ /**
+ * Determines if there is an overlap between the current interval and given one.
+ *
+ * @param aInterval Interval to compare with.
+ * @return True iff there is overlap
+ */
+ public boolean overlap(TimeInterval aInterval) {
+
+ if ( isUncertain() || aInterval.isUncertain()) {
+ // Optimistic assume there is no overlap if one of the intervals is uncertain.
+ return false;
+ }
+
+ if ( _end.asFloat() <= aInterval._begin.asFloat() ||
+ aInterval._end.asFloat() <= _begin.asFloat() ) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Determines if the actual time that the program corresponds to is uncertain due to
+ * the representation of a period of more than 24 hours using a 24 hour clock.
+ * @return True iff the interval is uncertain.
+ */
+ boolean isUncertain() {
+ return _begin.asFloat() > _end.asFloat();
+ }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.kiss;
+
+/**
+ * Visitor of the TV guide structure, which consists of a guide, its
+ * channels, and their programs. Each of these calls back the corresponding
+ * method from its <code>accept</code> method.
+ */
+public interface Visitor {
+
+    /**
+     * Called when a TV guide accepts this visitor.
+     *
+     * @param aGuide Guide being visited.
+     */
+    void visitTvGuide(TVGuide aGuide);
+
+    /**
+     * Called when a channel accepts this visitor.
+     *
+     * @param aChannel Channel being visited.
+     */
+    void visitChannel(Channel aChannel);
+
+    /**
+     * Called when a program accepts this visitor.
+     *
+     * @param aProgram Program being visited.
+     */
+    void visitProgram(Program aProgram);
+}