From 8c0b0a2d230139dff25630954e170e3c082395a6 Mon Sep 17 00:00:00 2001 From: Erik Brakkee Date: Sun, 19 Mar 2006 20:11:54 +0000 Subject: [PATCH] --- .classpath | 17 ------------ .../wamblee/crawler/AbstractPageRequest.java | 27 +++++++------------ .../org/wamblee/crawler/GetPageRequest.java | 17 ++---------- .../org/wamblee/crawler/PostPageRequest.java | 16 +---------- .../crawler/impl/ConfigurationParser.java | 4 +-- crawler/kiss/conf/kiss/programs.xml | 7 ++++- .../org/wamblee/crawler/kiss/KissCrawler.java | 2 -- .../src/org/wamblee/crawler/kiss/Program.java | 17 +++++++++++- .../crawler/kiss/ProgramActionExecutor.java | 2 -- .../kiss/ProgramConfigurationParser.java | 11 +++++++- .../crawler/kiss/SystemProperties.java | 11 ++++++++ 11 files changed, 57 insertions(+), 74 deletions(-) diff --git a/.classpath b/.classpath index 5ea6c605..751d4442 100644 --- a/.classpath +++ b/.classpath @@ -6,27 +6,15 @@ - - - - - - - - - - - - @@ -43,9 +31,7 @@ - - @@ -55,10 +41,7 @@ - - - diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index dd9e8ae7..baf9510b 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -19,7 +19,6 @@ package org.wamblee.crawler; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; -import java.io.PrintStream; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; @@ -62,8 +61,6 @@ public abstract class AbstractPageRequest implements PageRequest { private String _xslt; - private PrintStream _os; - /** * Constructs the request. * @@ -75,11 +72,9 @@ public abstract class AbstractPageRequest implements PageRequest { * Request parameters to use. * @param aXslt * XSLT used to convert the response. - * @param aOs - * Output stream for logging (if null then no logging is done). */ protected AbstractPageRequest(int aMaxTries, int aMaxDelay, - NameValuePair[] aParams, String aXslt, PrintStream aOs) { + NameValuePair[] aParams, String aXslt) { if (aParams == null) { throw new IllegalArgumentException("aParams is null"); } @@ -90,7 +85,6 @@ public abstract class AbstractPageRequest implements PageRequest { _maxDelay = aMaxDelay; _params = aParams; _xslt = aXslt; - _os = aOs; } /* @@ -163,14 +157,14 @@ public abstract class AbstractPageRequest implements PageRequest { Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt))); - _os.println("Transformed result is: "); + ByteArrayOutputStream os = new ByteArrayOutputStream(); Transformer transformer = TransformerFactory.newInstance() .newTransformer(); transformer.setParameter(OutputKeys.INDENT, "yes"); transformer.setParameter(OutputKeys.METHOD, "xml"); transformer.transform(new DOMSource(transformed), new StreamResult( - _os)); - + os)); + LOG.debug("Transformed result is \n" + os.toString()); return transformed; } catch (TransformerConfigurationException e) { throw new RuntimeException(e.getMessage(), e); @@ -195,24 +189,21 @@ public abstract class AbstractPageRequest implements PageRequest { tidy.setXHTML(true); tidy.setQuiet(true); tidy.setShowWarnings(false); - if (_os != null) { - _os.println("Content of '" + aMethod.getURI() + "'"); - _os.println(); - } + // We write the jtidy output to XML since the DOM tree it produces is // not namespace aware and namespace awareness is required by XSLT. // An alternative is to configure namespace awareness of the XML parser // in a system wide way. - Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), _os); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), os); DOMUtility.removeDuplicateAttributes(w3cDoc); + LOG.debug("Content of response is \n" + os.toString()); ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat()); serializer.serialize(w3cDoc); xhtml.flush(); - if (_os != null) { - _os.println(); - } + return xhtml.toByteArray(); } diff --git a/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java b/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java index 2ce267ee..1d92b024 100644 --- a/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java @@ -17,7 +17,6 @@ package org.wamblee.crawler; import java.io.IOException; -import java.io.PrintStream; import javax.xml.transform.TransformerException; @@ -40,21 +39,9 @@ public class GetPageRequest extends AbstractPageRequest { * @param aXslt XSLT to use. */ public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) { - super(aMaxTries, aMaxDelay, aParams, aXslt, null); + super(aMaxTries, aMaxDelay, aParams, aXslt); } - - /** - * Constructs the request. - * @param aMaxTries Maximum number of retries. - * @param aMaxDelay Maximum delay before executing the request. - * @param aParams Request parameters to use. - * @param aXslt XSLT to use. - * @param aOs Logging output stream to use. - */ - public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) { - super(aMaxTries, aMaxDelay, aParams, aXslt, aOs); - } - + /* * (non-Javadoc) * diff --git a/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java b/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java index 2bb7dc91..db1f2eb9 100644 --- a/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java @@ -17,7 +17,6 @@ package org.wamblee.crawler; import java.io.IOException; -import java.io.PrintStream; import javax.xml.transform.TransformerException; @@ -39,20 +38,7 @@ public class PostPageRequest extends AbstractPageRequest { * @param aXslt XSLT to use. */ public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) { - super(aMaxTries, aMaxDelay, aParams, aXslt, null); - } - - /** - * Constructs the request. - * @param aMaxTries Maximum number of retries. - * @param aMaxDelay Maximum delay before executing the request. - * @param aParams Request parameters to use. - * @param aXslt XSLT to use. - * @param aOs Logging output stream to use. - */ - public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, - PrintStream aOs) { - super(aMaxTries, aMaxDelay, aParams, aXslt, aOs); + super(aMaxTries, aMaxDelay, aParams, aXslt); } /* diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java index 6795bf1c..67934851 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java @@ -162,10 +162,10 @@ public class ConfigurationParser { PageRequest request; if (METHOD_POST.equals(method)) { request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, - xslt, _os); + xslt); } else if (METHOD_GET.equals(method) || method == null) { request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, - xslt, _os); + xslt); } else { throw new RuntimeException("Unknown request method '" + method + "'. Only " + METHOD_GET + " and " + METHOD_POST diff --git a/crawler/kiss/conf/kiss/programs.xml b/crawler/kiss/conf/kiss/programs.xml index 0b4b8e7a..c0bf4e83 100644 --- a/crawler/kiss/conf/kiss/programs.xml +++ b/crawler/kiss/conf/kiss/programs.xml @@ -1,22 +1,26 @@ - + + horror notify horror + films notify film horror|actie|thriller + science fiction notify (sci-fi)|(science fiction) + documentaires notify (zembla)|(uur.*wolf) @@ -46,6 +50,7 @@ + notify brainiac diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java index 7a0d421d..29d7c849 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java @@ -27,8 +27,6 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Properties; -import java.util.Set; -import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java b/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java index 2358180a..89c3c88c 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java @@ -29,13 +29,25 @@ import org.wamblee.crawler.PageException; */ public class Program { + /** + * Lexicographical comparison of programs based on (time, title, channel). + * + */ public static class TimeSorter implements Comparator { /* (non-Javadoc) * @see java.util.Comparator#compare(T, T) */ public int compare(Program o1, Program o2) { - return o1.getInterval().getBegin().compareTo(o2.getInterval().getBegin()); + int value = o1.getInterval().getBegin().compareTo(o2.getInterval().getBegin()); + if ( value != 0 ) { + return value; + } + value = o1.getName().compareTo(o2.getName()); + if (value != 0 ) { + return value; + } + return o1.getChannel().compareTo(o2.getChannel()); } } @@ -197,6 +209,9 @@ public class Program { */ public RecordingResult record() { LOG.info("Recording " + this); + if ( SystemProperties.isRecordDisabled() ) { + return RecordingResult.OK; + } try { Action record = _programInfo.execute().getAction(RECORD_ACTION); if (record == null) { diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramActionExecutor.java b/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramActionExecutor.java index 3865dcc8..8303449e 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramActionExecutor.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramActionExecutor.java @@ -16,9 +16,7 @@ package org.wamblee.crawler.kiss; -import java.util.ArrayList; import java.util.EnumMap; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java b/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java index 5d9f578d..e9ba272e 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java @@ -40,6 +40,8 @@ class ProgramConfigurationParser { private static final String ELEM_PATTERN = "match"; private static final String ELEM_ACTION = "action"; + + private static final String ELEM_CATEGORY = "category"; private static final String ACTION_NOTIFY = "notify"; @@ -61,13 +63,20 @@ class ProgramConfigurationParser { for (Iterator i = root.elementIterator(ELEM_PROGRAM); i.hasNext();) { Element program = (Element) i.next(); + Element categoryElem = program.element(ELEM_CATEGORY); + String category = ""; + if ( categoryElem != null ) { + category = categoryElem.getText().trim(); + } + Element actionElem = program.element(ELEM_ACTION); ProgramAction action = new RecordProgramAction(); if (actionElem != null) { if (actionElem.getText().equals(ACTION_NOTIFY)) { - action = new InterestingProgramAction(""); + action = new InterestingProgramAction(category); } } + List> regexConditions = new ArrayList>(); for (Iterator j = program.elementIterator(ELEM_PATTERN); j.hasNext(); ) { diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/SystemProperties.java b/crawler/kiss/src/org/wamblee/crawler/kiss/SystemProperties.java index 60bafa14..20af2a93 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/SystemProperties.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/SystemProperties.java @@ -23,6 +23,7 @@ public final class SystemProperties { private static final String DEBUG_PROPERTY = "kiss.debug"; private static final String NO_PROGRAM_DETAILS = "kiss.nodetails"; + private static final String DISABLE_RECORD = "kiss.norecord"; /** * Disabled constructor. @@ -47,4 +48,14 @@ public final class SystemProperties { public static boolean isNoProgramDetailsRequired() { return System.getProperties().getProperty(NO_PROGRAM_DETAILS) != null; } + + + /** + * Determines if recording is disabled. + * @return True iff no recording should be done. + */ + public static boolean isRecordDisabled() { + return System.getProperties().getProperty(DISABLE_RECORD) != null; + } + } -- 2.31.1