From: Erik Brakkee Date: Tue, 14 Mar 2006 20:53:38 +0000 (+0000) Subject: (no commit message) X-Git-Tag: wamblee-utils-0.7~1159 X-Git-Url: http://wamblee.org/gitweb/?a=commitdiff_plain;h=abee5af7177fb97dab546e49d1790c918b9a466e;p=utils --- diff --git a/crawler/kiss/build.xml b/crawler/kiss/build.xml new file mode 100644 index 00000000..488ea57d --- /dev/null +++ b/crawler/kiss/build.xml @@ -0,0 +1,32 @@ + + + + +]> + + + + + + + + + + + &header; + + + + + + + + + &trailer; + + + diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/AbstractVisitor.java b/crawler/kiss/src/org/wamblee/crawler/kiss/AbstractVisitor.java new file mode 100644 index 00000000..266878b6 --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/AbstractVisitor.java @@ -0,0 +1,50 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wamblee.crawler.kiss; + +import java.util.List; + +/** + * + */ +public abstract class AbstractVisitor implements Visitor { + + protected AbstractVisitor() { + // Empty + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.kiss.Visitor#visitChannel(org.wamblee.crawler.kiss.Channel) + */ + public void visitChannel(Channel aChannel) { + List programs = aChannel.getPrograms(); + for (Program program: programs) { + program.accept(this); + } + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.kiss.Visitor#visitTvGuide(org.wamblee.crawler.kiss.TVGuide) + */ + public void visitTvGuide(TVGuide aGuide) { + List channels = aGuide.getChannels(); + for (Channel channel: channels) { + channel.accept(this); + } + } + +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/Channel.java b/crawler/kiss/src/org/wamblee/crawler/kiss/Channel.java new file mode 100644 index 00000000..140a3d1c --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/Channel.java @@ -0,0 +1,46 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wamblee.crawler.kiss; + +import java.util.Collections; +import java.util.List; + +/** + * + */ +public class Channel { + + private String _name; + private List _programs; + + public Channel(String aName, List aPrograms) { + _name = aName; + _programs = aPrograms; + } + + public String getName() { + return _name; + } + + public List getPrograms() { + return Collections.unmodifiableList(_programs); + } + + public void accept(Visitor aVisitor) { + aVisitor.visitChannel(this); + } +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java new file mode 100644 index 00000000..dd9ba78d --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java @@ -0,0 +1,154 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wamblee.crawler.kiss; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Element; +import org.wamblee.conditions.Condition; +import org.wamblee.conditions.OrCondition; +import org.wamblee.crawler.Action; +import org.wamblee.crawler.Configuration; +import org.wamblee.crawler.Crawler; +import org.wamblee.crawler.Page; +import org.wamblee.crawler.impl.ConfigurationParser; +import org.wamblee.crawler.impl.CrawlerImpl; + +/** + * + */ +public class KissCrawler { + + private static final Log LOG = LogFactory.getLog(KissCrawler.class); + + private static final String LOG_FILE = "kiss.log"; + + private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; + + private static final String CRAWLER_CONFIG = "config.xml"; + + private static final String PROGRAM_CONFIG = "programs.xml"; + + private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; + + private Pattern _pattern; + + public KissCrawler(String aStartUrl, String aCrawlerConfig, String aProgramConfig) throws Exception { + + _pattern = Pattern.compile(TIME_REGEX); + + FileOutputStream fos = new FileOutputStream(new File(LOG_FILE)); + PrintStream os = new PrintStream(fos); + + try { + ConfigurationParser parser = new ConfigurationParser(os); + InputStream crawlerConfigFile = new FileInputStream(new File(aCrawlerConfig)); + Configuration config = parser.parse(crawlerConfigFile); + + InputStream programConfigFile = new FileInputStream(new File(aProgramConfig)); + Condition programCondition = new ProgramConfigurationParser().parse(programConfigFile); + + + HttpClient 
client = new HttpClient(); + // client.getHostConfiguration().setProxy("localhost", 3128); + + Crawler crawler = new CrawlerImpl(client, config); + + Page page = crawler.getPage(aStartUrl); + showPage(page); + page = page.getAction("channels-favorites").execute(); + TVGuide guide = createGuide(page); + PrintVisitor printer = new PrintVisitor(System.out); + guide.accept(printer); + + MatchVisitor matcher = new MatchVisitor(programCondition); + guide.accept(matcher); + List programs = matcher.getMatches(); + for (Program program: programs) { + System.out.println("Found: " + program + " record: " + program.record() ); + } + + } finally { + os.flush(); + os.close(); + System.out.println("Output written on '" + LOG_FILE + "'"); + } + } + + public static void main(String[] args) throws Exception { + new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); + } + + private void showPage(Page aPage) { + Action[] links = aPage.getActions(); + for (Action link : links) { + System.out.println("Link found '" + link.getName() + "'"); + } + Element element = aPage.getContent(); + System.out.println("Retrieved content: " + element.asXML()); + } + + private TVGuide createGuide(Page page) { + LOG.info("Obtaining full TV guide"); + Action[] actions = page.getActions(); + List channels = new ArrayList(); + for (Action action : actions) { + Channel channel = createChannel(action.getName(), action.execute() + .getAction("right-now").execute()); + channels.add(channel); + } + return new TVGuide(channels); + } + + private Channel createChannel(String aChannel, Page aPage) { + LOG.info("Obtaining program for " + aChannel); + Action[] programActions = aPage.getActions(); + List programs = new ArrayList(); + for (Action action : programActions) { + String time = action.getContent().element("time").getText().trim(); + Matcher matcher = _pattern.matcher(time); + if (matcher.matches()) { + Time begin = new Time(Integer.parseInt(matcher.group(1)), + Integer.parseInt(matcher.group(2))); + Time 
end = new Time(Integer.parseInt(matcher.group(3)), + Integer.parseInt(matcher.group(4))); + TimeInterval interval = new TimeInterval(begin, end); + //Page programInfo = action.execute(); + //String description = programInfo.getContent().element("description").getText().trim(); + //String keywords = programInfo.getContent().element("keywords").getText().trim(); + String description = ""; + String keywords = ""; + Program program = new Program(aChannel, action.getName(), description, keywords, interval, action); + + LOG.debug("Got program " + program); + programs.add(program); + } + } + return new Channel(aChannel, programs); + } +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/MatchVisitor.java b/crawler/kiss/src/org/wamblee/crawler/kiss/MatchVisitor.java new file mode 100644 index 00000000..a0574568 --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/MatchVisitor.java @@ -0,0 +1,51 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wamblee.crawler.kiss; + +import java.util.ArrayList; +import java.util.List; + +import org.wamblee.conditions.Condition; + +/** + * + */ +public class MatchVisitor extends AbstractVisitor { + + private Condition _matcher; + private List _programs; + + public MatchVisitor(Condition aMatcher) { + _matcher = aMatcher; + _programs = new ArrayList(); + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.kiss.Visitor#visitProgram(org.wamblee.crawler.kiss.Program) + */ + public void visitProgram(Program aProgram) { + if ( _matcher.matches(aProgram)) { + _programs.add(aProgram); + } + } + + public List getMatches() { + return _programs; + } + + +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/PrintVisitor.java b/crawler/kiss/src/org/wamblee/crawler/kiss/PrintVisitor.java new file mode 100644 index 00000000..387782b3 --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/PrintVisitor.java @@ -0,0 +1,47 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wamblee.crawler.kiss; + +import java.io.PrintStream; + +/** + * + */ +public class PrintVisitor extends AbstractVisitor { + + private PrintStream _stream; + + public PrintVisitor(PrintStream aStream) { + _stream = aStream; + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.kiss.Visitor#visitProgram(org.wamblee.crawler.kiss.Program) + */ + public void visitProgram(Program aProgram) { + _stream.println(" " + aProgram.toString()); + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.kiss.AbstractVisitor#visitChannel(org.wamblee.crawler.kiss.Channel) + */ + @Override + public void visitChannel(Channel aChannel) { + System.out.println(aChannel.getName()); + super.visitChannel(aChannel); + } +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java b/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java new file mode 100644 index 00000000..765ddfe1 --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java @@ -0,0 +1,86 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wamblee.crawler.kiss; + +import org.wamblee.crawler.Action; + +/** + * + */ +public class Program { + + private static final String RECORD_ACTION = "record"; + private static final String INDENT = " "; + + private String _channel; + private String _name; + private String _description; + private String _keywords; + private TimeInterval _interval; + private Action _programInfo; + + public Program(String aChannel, String aName, String aDescription, String aKeywords, TimeInterval aInterval, Action aProgramInfo) { + _channel = aChannel; + _name = aName; + _description = aDescription; + _keywords = aKeywords; + _interval = aInterval; + _programInfo = aProgramInfo; + } + + public String getChannel() { + return _channel; + } + + public String getName() { + return _name; + } + + public String getDescription() { + return _description; + } + + public String getKeywords() { + return _keywords; + } + + public TimeInterval getInterval() { + return _interval; + } + + public boolean record() { + Action record = _programInfo.execute().getAction(RECORD_ACTION); + if ( record == null) { + return false; + } + record.execute(); + return true; + } + + public void accept(Visitor aVisitor) { + aVisitor.visitProgram(this); + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return _interval + " - " + _name + " (" + _channel + "/" + _keywords + ")" + "\n" + + (INDENT + _description).replaceAll("\n", "\n" + INDENT); + } +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java b/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java new file mode 100644 index 00000000..71719799 --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java @@ -0,0 +1,65 @@ +/* + * Copyright 2005 the original author or authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; +import org.wamblee.conditions.Condition; +import org.wamblee.conditions.OrCondition; + +/** + * Parse the configuration of desired programs. + */ +public class ProgramConfigurationParser { + + + private static final String ELEM_PROGRAM = "program"; + private static final String ELEM_PATTERN = "name"; + + /** + * Parses the condition used to match the desired programs. + * + * @param aStream + * Input stream to parse from. + * @return Condition. 
+ */ + Condition parse(InputStream aStream) { + try { + SAXReader reader = new SAXReader(); + Document document = reader.read(aStream); + + Element root = document.getRootElement(); + List> conditions = new ArrayList>(); + + for (Iterator i = root.elementIterator(ELEM_PROGRAM); i.hasNext(); ) { + Element program = (Element)i.next(); + String pattern = ".*" + program.element(ELEM_PATTERN).getText() + ".*"; + conditions.add(new ProgramNameMatcher(pattern)); + } + return new OrCondition(conditions); + } catch (DocumentException e) { + throw new RuntimeException("Error parsing program configuraiton", e); + } + } +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramNameMatcher.java b/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramNameMatcher.java new file mode 100644 index 00000000..cd36079c --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramNameMatcher.java @@ -0,0 +1,42 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.wamblee.conditions.Condition; + +/** + * Match programs based on a regular expression for the name in lower case. 
+ */ +public class ProgramNameMatcher implements Condition { + + private Pattern _pattern; + + public ProgramNameMatcher(String aPattern) { + _pattern = Pattern.compile(aPattern); + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.kiss.ProgramMatcher#matches(org.wamblee.crawler.kiss.Program) + */ + public boolean matches(Program aProgram) { + Matcher matcher = _pattern.matcher(aProgram.getName().toLowerCase()); + return matcher.matches(); + } +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/TVGuide.java b/crawler/kiss/src/org/wamblee/crawler/kiss/TVGuide.java new file mode 100644 index 00000000..fb73f750 --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/TVGuide.java @@ -0,0 +1,41 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wamblee.crawler.kiss; + +import java.util.Collections; +import java.util.List; + +/** + * + */ +public class TVGuide { + + private List _channels; + + public TVGuide(List aChannels) { + _channels = aChannels; + } + + public List getChannels() { + return Collections.unmodifiableList(_channels); + } + + public void accept(Visitor aVisitor) { + aVisitor.visitTvGuide(this); + } + +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/Time.java b/crawler/kiss/src/org/wamblee/crawler/kiss/Time.java new file mode 100644 index 00000000..b2f95f05 --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/Time.java @@ -0,0 +1,55 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * A time of day given as an hour and a minute. The values are stored as
 * passed in; no range checking is done.
 */
public class Time {

    private int _hour;
    private int _minute;

    /**
     * Constructs the time.
     *
     * @param aHour
     *            Hour.
     * @param aMinute
     *            Minute.
     */
    public Time(int aHour, int aMinute) {
        _hour = aHour;
        _minute = aMinute;
    }

    /**
     * @return Hour.
     */
    public int getHour() {
        return _hour;
    }

    /**
     * @return Minute.
     */
    public int getMinute() {
        return _minute;
    }

    /* (non-Javadoc)
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        // Two digit, zero padded hour and minute separated by a colon.
        NumberFormat twoDigits = new DecimalFormat("00");
        StringBuilder text = new StringBuilder();
        text.append(twoDigits.format(_hour));
        text.append(":");
        text.append(twoDigits.format(_minute));
        return text.toString();
    }

    /**
     * Converts the time to a fractional number of hours.
     *
     * @return Hour plus minute divided by 60.
     */
    float asFloat() {
        float hours = (float) _hour;
        float fractionOfHour = (float) _minute / 60.0f;
        return hours + fractionOfHour;
    }
}
+ */ + +package org.wamblee.crawler.kiss; + +/** + * + */ +public class TimeInterval { + + private Time _begin; + private Time _end; + + public TimeInterval(Time aBegin, Time aEnd) { + _begin = aBegin; + _end = aEnd; + } + + public Time getBegin() { + return _begin; + } + + public Time getEnd() { + return _end; + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return _begin + " - " + _end; + } + + /** + * Determines if there is an overlap between the current interval and given one. + * + * @param aInterval Interval to compare with. + * @return True iff there is overlap + */ + public boolean overlap(TimeInterval aInterval) { + + if ( isUncertain() || aInterval.isUncertain()) { + // Optimistic assume there is no overlap if one of the intervals is uncertain. + return false; + } + + if ( _end.asFloat() <= aInterval._begin.asFloat() || + aInterval._end.asFloat() <= _begin.asFloat() ) { + return false; + } + + return true; + } + + /** + * Determines if the actual time that the program corresponds to is uncertain due to + * the representation of a period of more than 24 hours using a 24 hour clock. + * @return True iff the interval is uncertain. + */ + boolean isUncertain() { + return _begin.asFloat() > _end.asFloat(); + } +} diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/Visitor.java b/crawler/kiss/src/org/wamblee/crawler/kiss/Visitor.java new file mode 100644 index 00000000..df9be674 --- /dev/null +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/Visitor.java @@ -0,0 +1,29 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.kiss; + +/** + * + */ +public interface Visitor { + + void visitProgram(Program aProgram); + + void visitChannel(Channel aChannel); + + void visitTvGuide(TVGuide aGuide); +}