(no commit message)
author erik <erik@77661180-640e-0410-b3a8-9f9b13e6d0e0>
Sun, 19 Mar 2006 20:11:54 +0000 (20:11 +0000)
committer erik <erik@77661180-640e-0410-b3a8-9f9b13e6d0e0>
Sun, 19 Mar 2006 20:11:54 +0000 (20:11 +0000)
.classpath
crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java
crawler/basic/src/org/wamblee/crawler/GetPageRequest.java
crawler/basic/src/org/wamblee/crawler/PostPageRequest.java
crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java
crawler/kiss/conf/kiss/programs.xml
crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java
crawler/kiss/src/org/wamblee/crawler/kiss/Program.java
crawler/kiss/src/org/wamblee/crawler/kiss/ProgramActionExecutor.java
crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java
crawler/kiss/src/org/wamblee/crawler/kiss/SystemProperties.java

index 5ea6c60535138d68738f7847371796918c7cf183..751d4442436a23be9b4c3849f2d699d2dec14cdc 100644 (file)
@@ -6,27 +6,15 @@
        <classpathentry output="support/bin" kind="src" path="support/src"/>
        <classpathentry output="support/testbin" kind="src" path="support/test"/>
        <classpathentry kind="lib" path="crawler/basic/lib/external/commons-httpclient-3.0.jar"/>
-       <classpathentry kind="lib" path="crawler/basic/lib/external/commons-logging-1.0.2.jar"/>
-       <classpathentry kind="lib" path="crawler/basic/lib/external/dom4j-1.6.jar"/>
        <classpathentry kind="lib" path="crawler/basic/lib/external/jtidy-4aug2000r7-dev.jar"/>
-       <classpathentry kind="lib" path="crawler/basic/lib/external/log4j-1.2.9.jar"/>
        <classpathentry kind="lib" path="crawler/basic/lib/test/dbunit-2.1.jar"/>
-       <classpathentry kind="lib" path="crawler/basic/lib/test/emma_ant-2.0.5312.jar"/>
-       <classpathentry kind="lib" path="crawler/basic/lib/test/emma-2.0.5312.jar"/>
-       <classpathentry kind="lib" path="crawler/basic/lib/test/jdbc2_0-stdext.jar"/>
        <classpathentry kind="lib" path="crawler/basic/lib/test/jmock-1.0.1.jar"/>
        <classpathentry kind="lib" path="crawler/basic/lib/test/jmock-cglib-1.0.1.jar"/>
        <classpathentry kind="lib" path="crawler/basic/lib/test/jta.jar"/>
        <classpathentry kind="lib" path="crawler/basic/lib/test/junit-3.8.1.jar"/>
        <classpathentry kind="lib" path="crawler/kiss/lib/external/commons-httpclient-3.0.jar"/>
-       <classpathentry kind="lib" path="crawler/kiss/lib/external/commons-logging-1.0.2.jar"/>
-       <classpathentry kind="lib" path="crawler/kiss/lib/external/dom4j-1.6.jar"/>
        <classpathentry kind="lib" path="crawler/kiss/lib/external/jtidy-4aug2000r7-dev.jar"/>
-       <classpathentry kind="lib" path="crawler/kiss/lib/external/log4j-1.2.9.jar"/>
        <classpathentry kind="lib" path="crawler/kiss/lib/test/dbunit-2.1.jar"/>
-       <classpathentry kind="lib" path="crawler/kiss/lib/test/emma_ant-2.0.5312.jar"/>
-       <classpathentry kind="lib" path="crawler/kiss/lib/test/emma-2.0.5312.jar"/>
-       <classpathentry kind="lib" path="crawler/kiss/lib/test/jdbc2_0-stdext.jar"/>
        <classpathentry kind="lib" path="crawler/kiss/lib/test/jmock-1.0.1.jar"/>
        <classpathentry kind="lib" path="crawler/kiss/lib/test/jmock-cglib-1.0.1.jar"/>
        <classpathentry kind="lib" path="crawler/kiss/lib/test/jta.jar"/>
@@ -43,9 +31,7 @@
        <classpathentry kind="lib" path="lib/special/hibernate/antlr-2.7.5H3.jar"/>
        <classpathentry kind="lib" path="lib/special/hibernate/asm.jar"/>
        <classpathentry kind="lib" path="lib/special/hibernate/asm-attrs.jar"/>
-       <classpathentry kind="lib" path="lib/special/test/jdbc2_0-stdext.jar"/>
        <classpathentry kind="lib" path="lib/special/test/jta.jar"/>
-       <classpathentry kind="lib" path="support/lib/external/commons-logging-1.0.2.jar"/>
        <classpathentry kind="lib" path="support/lib/external/dom4j-1.6.jar"/>
        <classpathentry kind="lib" path="support/lib/external/ehcache-1.1.jar"/>
        <classpathentry kind="lib" path="support/lib/external/log4j-1.2.9.jar"/>
        <classpathentry kind="lib" path="support/lib/test/asm-attrs.jar"/>
        <classpathentry kind="lib" path="support/lib/test/cglib-2.1.jar"/>
        <classpathentry kind="lib" path="support/lib/test/dbunit-2.1.jar"/>
-       <classpathentry kind="lib" path="support/lib/test/emma_ant-2.0.5312.jar"/>
-       <classpathentry kind="lib" path="support/lib/test/emma-2.0.5312.jar"/>
        <classpathentry kind="lib" path="support/lib/test/hibernate-3.0.5.jar"/>
-       <classpathentry kind="lib" path="support/lib/test/jdbc2_0-stdext.jar"/>
        <classpathentry kind="lib" path="support/lib/test/jmock-1.0.1.jar"/>
        <classpathentry kind="lib" path="support/lib/test/jmock-cglib-1.0.1.jar"/>
        <classpathentry kind="lib" path="support/lib/test/jta.jar"/>
index dd9e8ae71a5d94dc9856a27a253daaa4f4a3181b..baf9510b8c2ee9d560d1e888fbb58b8bd5b7b170 100644 (file)
@@ -19,7 +19,6 @@ package org.wamblee.crawler;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.PrintStream;
 
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
@@ -62,8 +61,6 @@ public abstract class AbstractPageRequest implements PageRequest {
 
     private String _xslt;
 
-    private PrintStream _os;
-
     /**
      * Constructs the request.
      * 
@@ -75,11 +72,9 @@ public abstract class AbstractPageRequest implements PageRequest {
      *            Request parameters to use.
      * @param aXslt
      *            XSLT used to convert the response.
-     * @param aOs
-     *            Output stream for logging (if null then no logging is done).
      */
     protected AbstractPageRequest(int aMaxTries, int aMaxDelay,
-            NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+            NameValuePair[] aParams, String aXslt) {
         if (aParams == null) {
             throw new IllegalArgumentException("aParams is null");
         }
@@ -90,7 +85,6 @@ public abstract class AbstractPageRequest implements PageRequest {
         _maxDelay = aMaxDelay;
         _params = aParams;
         _xslt = aXslt;
-        _os = aOs;
     }
 
     /*
@@ -163,14 +157,14 @@ public abstract class AbstractPageRequest implements PageRequest {
 
             Document transformed = new XSLT().transform(xhtmlData,
                     new FileResource(new File(_xslt)));
-            _os.println("Transformed result is: ");
+            ByteArrayOutputStream os = new ByteArrayOutputStream(); 
             Transformer transformer = TransformerFactory.newInstance()
                     .newTransformer();
             transformer.setParameter(OutputKeys.INDENT, "yes");
             transformer.setParameter(OutputKeys.METHOD, "xml");
             transformer.transform(new DOMSource(transformed), new StreamResult(
-                    _os));
-
+                    os));
+            LOG.debug("Transformed result is \n" + os.toString());
             return transformed;
         } catch (TransformerConfigurationException e) {
             throw new RuntimeException(e.getMessage(), e);
@@ -195,24 +189,21 @@ public abstract class AbstractPageRequest implements PageRequest {
         tidy.setXHTML(true);
         tidy.setQuiet(true);
         tidy.setShowWarnings(false);
-        if (_os != null) {
-            _os.println("Content of '" + aMethod.getURI() + "'");
-            _os.println();
-        }
+      
         // We write the jtidy output to XML since the DOM tree it produces is
         // not namespace aware and namespace awareness is required by XSLT.
         // An alternative is to configure namespace awareness of the XML parser
         // in a system wide way.
-        Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), _os);
+        ByteArrayOutputStream os = new ByteArrayOutputStream(); 
+        Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), os);
         DOMUtility.removeDuplicateAttributes(w3cDoc);
+        LOG.debug("Content of response is \n" + os.toString()); 
 
         ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
         XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat());
         serializer.serialize(w3cDoc);
         xhtml.flush();
-        if (_os != null) {
-            _os.println();
-        }
+
         return xhtml.toByteArray();
     }
 
index 2ce267ee828e72757efd2d5d22e0c7ea83dda963..1d92b024b78241629b8d748d23e4fbdea8b1d11b 100644 (file)
@@ -17,7 +17,6 @@
 package org.wamblee.crawler;
 
 import java.io.IOException;
-import java.io.PrintStream;
 
 import javax.xml.transform.TransformerException;
 
@@ -40,21 +39,9 @@ public class GetPageRequest extends AbstractPageRequest {
      * @param aXslt XSLT to use. 
      */
     public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) {
-        super(aMaxTries, aMaxDelay, aParams, aXslt, null);
+        super(aMaxTries, aMaxDelay, aParams, aXslt);
     }
-
-    /**
-     * Constructs the request.
-     * @param aMaxTries Maximum number of retries. 
-     * @param aMaxDelay Maximum delay before executing the request.
-     * @param aParams Request parameters to use. 
-     * @param aXslt XSLT to use.
-     * @param aOs Logging output stream to use.  
-     */
-    public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) {
-        super(aMaxTries, aMaxDelay, aParams, aXslt, aOs);
-    }
-
+    
     /*
      * (non-Javadoc)
      * 
index 2bb7dc91a88a7ae9c66cca685fffe3d54fd00dfb..db1f2eb9c9b93c4f632157469a681d9fde99a509 100644 (file)
@@ -17,7 +17,6 @@
 package org.wamblee.crawler;
 
 import java.io.IOException;
-import java.io.PrintStream;
 
 import javax.xml.transform.TransformerException;
 
@@ -39,20 +38,7 @@ public class PostPageRequest extends AbstractPageRequest {
      * @param aXslt XSLT to use.  
      */
     public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) {
-        super(aMaxTries, aMaxDelay, aParams, aXslt, null);
-    }
-
-    /**
-     * Constructs the request.
-     * @param aMaxTries Maximum number of retries. 
-     * @param aMaxDelay Maximum delay before executing the request.
-     * @param aParams Request parameters to use. 
-     * @param aXslt XSLT to use.
-     * @param aOs Logging output stream to use.  
-     */
-    public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt,
-            PrintStream aOs) {
-        super(aMaxTries, aMaxDelay, aParams, aXslt, aOs);
+        super(aMaxTries, aMaxDelay, aParams, aXslt);
     }
 
     /*
index 6795bf1c6c1fa1830d14e2902147f5d3816d31b4..679348517ead6824b354dfa7198307fcbe6dee16 100644 (file)
@@ -162,10 +162,10 @@ public class ConfigurationParser {
         PageRequest request;
         if (METHOD_POST.equals(method)) {
             request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
-                    xslt, _os);
+                    xslt);
         } else if (METHOD_GET.equals(method) || method == null) {
             request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
-                    xslt, _os);
+                    xslt);
         } else {
             throw new RuntimeException("Unknown request method '" + method
                     + "'. Only " + METHOD_GET + " and " + METHOD_POST
index 0b4b8e7a1c4e4b0a24625da5034db7f8e7727617..c0bf4e835bf6f9a6e1c12df0138c1031d9706fe4 100644 (file)
@@ -1,22 +1,26 @@
 <programs>
-  
   <program>
+    <category>horror</category>
     <action>notify</action>
     <match field="description">horror</match>
   </program>
   
   <program>
+    <category>films</category>
     <action>notify</action>
     <match field="keywords">film</match>
     <match field="description">horror|actie|thriller</match>
   </program>
   
   <program>
+    <category>science fiction</category>
     <action>notify</action>
     <match field="description">(sci-fi)|(science fiction)</match>
   </program>
   
   <program>
+    <category>documentaires</category>
     <action>notify</action>
     <match>(zembla)|(uur.*wolf)</match>
   </program>
@@ -46,6 +50,7 @@
   </program>
   
   <program>
+    <action>notify</action>
     <match>brainiac</match>
   </program>
   
index 7a0d421d294e50a777c9fe02d6186924960ffbe5..29d7c84940717d76711593e43c0e88272f011144 100644 (file)
@@ -27,8 +27,6 @@ import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;
 import java.util.Properties;
-import java.util.Set;
-import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
index 2358180a405acb95e8f979746204b3886e3356c3..89c3c88ce1704612eda0dc5044abac95474f6349 100644 (file)
@@ -29,13 +29,25 @@ import org.wamblee.crawler.PageException;
  */
 public class Program {
     
+    /**
+     * Lexicographical comparison of programs based on (time, title, channel). 
+     *
+     */
     public static class TimeSorter implements Comparator<Program> { 
      
         /* (non-Javadoc)
          * @see java.util.Comparator#compare(T, T)
          */
         public int compare(Program o1, Program o2) { 
-            return o1.getInterval().getBegin().compareTo(o2.getInterval().getBegin());
+            int value = o1.getInterval().getBegin().compareTo(o2.getInterval().getBegin());
+            if ( value != 0 ) { 
+                return value; 
+            }
+            value = o1.getName().compareTo(o2.getName()); 
+            if (value != 0 ) { 
+                return value; 
+            }
+            return o1.getChannel().compareTo(o2.getChannel());
         }
     }
     
@@ -197,6 +209,9 @@ public class Program {
      */
     public RecordingResult record() {
         LOG.info("Recording " + this);
+        if ( SystemProperties.isRecordDisabled() ) { 
+            return RecordingResult.OK;
+        }
         try {
             Action record = _programInfo.execute().getAction(RECORD_ACTION);
             if (record == null) {
index 3865dcc8339f762107e177401cf5553643ba752b..8303449e5544799b5f7146f969054fcc2145340f 100644 (file)
@@ -16,9 +16,7 @@
 
 package org.wamblee.crawler.kiss;
 
-import java.util.ArrayList;
 import java.util.EnumMap;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
index 5d9f578d6e596d2912de5149017fcd6600b61588..e9ba272e3ec33fe25497c6637d87974af91075f8 100644 (file)
@@ -40,6 +40,8 @@ class ProgramConfigurationParser {
     private static final String ELEM_PATTERN = "match";
 
     private static final String ELEM_ACTION = "action";
+    
+    private static final String ELEM_CATEGORY = "category";
 
     private static final String ACTION_NOTIFY = "notify";
 
@@ -61,13 +63,20 @@ class ProgramConfigurationParser {
             for (Iterator i = root.elementIterator(ELEM_PROGRAM); i.hasNext();) {
                 Element program = (Element) i.next();
 
+                Element categoryElem = program.element(ELEM_CATEGORY);
+                String category = "";
+                if ( categoryElem != null ) { 
+                    category = categoryElem.getText().trim(); 
+                }
+                
                 Element actionElem = program.element(ELEM_ACTION);
                 ProgramAction action = new RecordProgramAction();
                 if (actionElem != null) {
                     if (actionElem.getText().equals(ACTION_NOTIFY)) {
-                        action = new InterestingProgramAction("");
+                        action = new InterestingProgramAction(category);
                     }
                 }
+              
                 List<Condition<Program>> regexConditions = 
                     new ArrayList<Condition<Program>>();
                 for (Iterator j = program.elementIterator(ELEM_PATTERN); j.hasNext(); ) {
index 60bafa1454b679e54db83c72327ee664b87bc8c3..20af2a93a4e628f768dc843e3d0273b612ed6aa9 100644 (file)
@@ -23,6 +23,7 @@ public final class SystemProperties {
     
     private static final String DEBUG_PROPERTY = "kiss.debug";
     private static final String NO_PROGRAM_DETAILS = "kiss.nodetails";
+    private static final String DISABLE_RECORD = "kiss.norecord"; 
     
     /**
      * Disabled constructor. 
@@ -47,4 +48,14 @@ public final class SystemProperties {
     public static boolean isNoProgramDetailsRequired() { 
         return System.getProperties().getProperty(NO_PROGRAM_DETAILS) != null; 
     }
+    
+
+    /**
+     * Determines if recording is disabled. 
+     * @return True iff no recording should be done.  
+     */
+    public static boolean isRecordDisabled() { 
+        return System.getProperties().getProperty(DISABLE_RECORD) != null; 
+    }
+    
 }