From: erik Date: Thu, 16 Mar 2006 21:53:22 +0000 (+0000) Subject: (no commit message) X-Git-Tag: wamblee-utils-0.2@603~559 X-Git-Url: http://wamblee.org/gitweb/?a=commitdiff_plain;h=8470878078393cdd8bbf85c2b87910e3ab97cbd1;p=utils --- diff --git a/trunk/crawler/basic/build.xml b/trunk/crawler/basic/build.xml index a88dbabc..ac44f846 100644 --- a/trunk/crawler/basic/build.xml +++ b/trunk/crawler/basic/build.xml @@ -12,7 +12,7 @@ - + &header; diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/trunk/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 73132aa8..30477204 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -49,127 +49,173 @@ import org.wamblee.xml.XSLT; public abstract class AbstractPageRequest implements PageRequest { private static final Log LOG = LogFactory.getLog(AbstractPageRequest.class); + private static final String REDIRECT_HEADER = "Location"; - - private int _maxTries; - private int _maxDelay; + + private int _maxTries; + + private int _maxDelay; private NameValuePair[] _params; private String _xslt; - - private PrintStream _os; - protected AbstractPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) { - if ( aParams == null ) { + private PrintStream _os; + + /** + * Constructs the request. + * @param aMaxTries Maximum retries to perform. + * @param aMaxDelay Maximum delay before executing a request. + * @param aParams Request parameters to use. + * @param aXslt XSLT used to convert the response. + * @param aOs Output stream for logging (if null then no logging is done). 
+ */ + protected AbstractPageRequest(int aMaxTries, int aMaxDelay, + NameValuePair[] aParams, String aXslt, PrintStream aOs) { + if (aParams == null) { throw new IllegalArgumentException("aParams is null"); } - if ( aXslt == null ) { + if (aXslt == null) { throw new IllegalArgumentException("aXslt is null"); } _maxTries = aMaxTries; _maxDelay = aMaxDelay; _params = aParams; _xslt = aXslt; - _os = aOs; + _os = aOs; } - - /* (non-Javadoc) + + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.PageRequest#overrideXslt(java.lang.String) */ public void overrideXslt(String aXslt) { - _xslt = aXslt; + _xslt = aXslt; } - - protected NameValuePair[] getParameters() { + + /** + * Gets the parameters for the request. + * @return Request parameters. + */ + protected NameValuePair[] getParameters() { return _params; } - - protected Document executeMethod(HttpClient client, HttpMethod method) throws TransformerException { - int triesLeft = _maxTries; - while ( triesLeft > 0 ) { + + /** + * Executes the request with a random delay and with a maximum number of + * retries. + * @param aClient HTTP client to use. + * @param aMethod Method representing the request. + * @return XML document describing the response. + * @throws TransformerException In case transformation of the HTML to XML fails. 
+ */ + protected Document executeMethod(HttpClient aClient, HttpMethod aMethod) + throws TransformerException { + int triesLeft = _maxTries; + while (triesLeft > 0) { triesLeft--; - try { - return executeMethodWithoutRetries(client, method); - } catch (TransformerException e) { - if ( triesLeft == 0 ) { + try { + return executeMethodWithoutRetries(aClient, aMethod); + } catch (TransformerException e) { + if (triesLeft == 0) { throw e; } } } throw new RuntimeException("Code should never reach this point"); } - - protected Document executeMethodWithoutRetries(HttpClient client, HttpMethod method) throws TransformerException { + /** + * Executes the request without doing any retries in case XSLT transformation + * fails. + * @param aClient HTTP client to use. + * @param aMethod Method to execute. + * @return XML document containing the result. + * @throws TransformerException In case transformation of the result to XML fails. + */ + protected Document executeMethodWithoutRetries(HttpClient aClient, + HttpMethod aMethod) throws TransformerException { try { // Execute the method. - method = executeWithRedirects(client, method); + aMethod = executeWithRedirects(aClient, aMethod); // Transform the HTML into wellformed XML. Tidy tidy = new Tidy(); tidy.setXHTML(true); - tidy.setQuiet(true); + tidy.setQuiet(true); tidy.setShowWarnings(false); - if ( _os != null ) { - _os.println("Content of '" + method.getURI() + "'"); + if (_os != null) { + _os.println("Content of '" + aMethod.getURI() + "'"); _os.println(); } - // We let jtidy produce raw output because the DOM it produces is - // is not namespace aware. We let the XSLT processor parse the XML again - // to ensure that the XSLT uses a namespace aware DOM tree. An alternative - // is to configure namespace awareness of the XML parser in a system wide way. + // We let jtidy produce raw output because the DOM it produces is + // is not namespace aware. 
We let the XSLT processor parse the XML + // again + // to ensure that the XSLT uses a namespace aware DOM tree. An + // alternative + // is to configure namespace awareness of the XML parser in a system + // wide way. ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); - tidy.parse(method.getResponseBodyAsStream(), xhtml); + tidy.parse(aMethod.getResponseBodyAsStream(), xhtml); _os.print(new String(xhtml.toByteArray())); - // Obtaining the XML as dom is not used. - //Document w3cDoc = tidy.parseDOM(method.getResponseBodyAsStream(), - // _os); - if ( _os != null ) { + // Obtaining the XML as dom is not used. + // Document w3cDoc = tidy.parseDOM(method.getResponseBodyAsStream(), + // _os); + if (_os != null) { _os.println(); } xhtml.flush(); byte[] xhtmlData = xhtml.toByteArray(); - Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt))); + Document transformed = new XSLT().transform(xhtmlData, + new FileResource(new File(_xslt))); _os.println("Transformed result is: "); - Transformer transformer = TransformerFactory.newInstance().newTransformer(); + Transformer transformer = TransformerFactory.newInstance() + .newTransformer(); transformer.setParameter(OutputKeys.INDENT, "yes"); transformer.setParameter(OutputKeys.METHOD, "xml"); - transformer.transform(new DOMSource(transformed), new StreamResult(_os)); - + transformer.transform(new DOMSource(transformed), new StreamResult( + _os)); + return transformed; - } catch (HttpException e) { - throw new RuntimeException(e.getMessage(), e); - } catch (IOException e) { + } catch (HttpException e) { + throw new RuntimeException(e.getMessage(), e); + } catch (IOException e) { throw new RuntimeException(e.getMessage(), e); - } catch (TransformerConfigurationException e) { + } catch (TransformerConfigurationException e) { throw new RuntimeException(e.getMessage(), e); } finally { // Release the connection. 
- method.releaseConnection(); + aMethod.releaseConnection(); } } - - private void delay() { + + /** + * Sleeps for a random time but no more than the maximum delay. + * + */ + private void delay() { try { - Thread.sleep((long)((float)_maxDelay* Math.random())); - } catch (InterruptedException e) { - // + Thread.sleep((long) ((float) _maxDelay * Math.random())); + } catch (InterruptedException e) { + return; // to satisfy checkstyle } } - /** - * @param aClient - * @param aMethod - * @throws IOException - * @throws HttpException + * Executes the request and follows redirects if needed. + * @param aClient HTTP client to use. + * @param aMethod Method to use. + * @return Final HTTP method used (differs from the parameter passed in in case + * of redirection). + * @throws IOException In case of network problems. */ - private HttpMethod executeWithRedirects(HttpClient aClient, HttpMethod aMethod) throws IOException, HttpException { + private HttpMethod executeWithRedirects(HttpClient aClient, + HttpMethod aMethod) throws IOException { delay(); int statusCode = aClient.executeMethod(aMethod); - switch (statusCode) { + switch (statusCode) { case HttpStatus.SC_OK: { return aMethod; } @@ -179,7 +225,9 @@ public abstract class AbstractPageRequest implements PageRequest { aMethod.releaseConnection(); Header header = aMethod.getResponseHeader(REDIRECT_HEADER); aMethod = new GetMethod(header.getValue()); - return executeWithRedirects(aClient, aMethod); // TODO protect against infinite recursion. + return executeWithRedirects(aClient, aMethod); // TODO protect + // against infinite + // recursion. 
} default: { throw new RuntimeException("Method failed: " diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/Action.java b/trunk/crawler/basic/src/org/wamblee/crawler/Action.java index 0002814d..cd9b4e2a 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/Action.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/Action.java @@ -31,14 +31,15 @@ public interface Action { /** * Executes the action. - * @return + * @return New page as a result of the action. + * @throws PageException In case of an error obtaining the page. */ Page execute() throws PageException; /** * Gets a description of the action. THe element returned is the action element * itself. - * @return + * @return Content as XML. */ Element getContent(); } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/Configuration.java b/trunk/crawler/basic/src/org/wamblee/crawler/Configuration.java index 662182ca..93372b83 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/Configuration.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/Configuration.java @@ -24,7 +24,7 @@ public interface Configuration { /** * Gets the page request based on the URL. - * @param aUrl + * @param aUrl Url of the page to retrieve. * @return Page request. */ PageRequest getRequest(String aUrl); diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/Crawler.java b/trunk/crawler/basic/src/org/wamblee/crawler/Crawler.java index 07dff3fd..201df3fb 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/Crawler.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/Crawler.java @@ -26,6 +26,7 @@ public interface Crawler { * Gets the content for a specific page. * @param aUrl Url of page. * @return Page to retrieve. + * @throws PageException In case of problems retrieving the page. */ Page getPage(String aUrl) throws PageException; @@ -33,7 +34,8 @@ public interface Crawler { * Gets the content for a specific page. * @param aUrl Url of page. * @param aType Type of page. - * @return Page. + * @return Page. 
+ * @throws PageException In case of problems retrieving the page. */ Page getPage(String aUrl, PageType aType) throws PageException; } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java b/trunk/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java index 9a9d02e4..3da77b83 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java @@ -31,10 +31,25 @@ import org.w3c.dom.Document; */ public class GetPageRequest extends AbstractPageRequest { + /** + * Constructs the request. + * @param aMaxTries Maximum number of retries. + * @param aMaxDelay Maximum delay before executing the request. + * @param aParams Request parameters to use. + * @param aXslt XSLT to use. + */ public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) { super(aMaxTries, aMaxDelay, aParams, aXslt, null); } + /** + * Constructs the request. + * @param aMaxTries Maximum number of retries. + * @param aMaxDelay Maximum delay before executing the request. + * @param aParams Request parameters to use. + * @param aXslt XSLT to use. + * @param aOs Logging output stream to use. + */ public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) { super(aMaxTries, aMaxDelay, aParams, aXslt, aOs); } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/PageException.java b/trunk/crawler/basic/src/org/wamblee/crawler/PageException.java index a48d3d5f..22a1e1bb 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/PageException.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/PageException.java @@ -17,14 +17,24 @@ package org.wamblee.crawler; /** - * + * Exception thrown when there is a problem in retrieving or transforming the + * page. */ public class PageException extends Exception { + /** + * Constructs the exception. + * @param aMsg Message. 
+ */ public PageException(String aMsg) { super(aMsg); } + /** + * Constructs the exception. + * @param aMsg Message. + * @param aCause Cause of the exception. + */ public PageException(String aMsg, Throwable aCause) { super(aMsg, aCause); } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/PageRequest.java b/trunk/crawler/basic/src/org/wamblee/crawler/PageRequest.java index 753bc05b..192f74e4 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/PageRequest.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/PageRequest.java @@ -25,14 +25,17 @@ import org.w3c.dom.Document; public interface PageRequest { /** - * Gets a page as an XML document. + * Gets a page as an XML document. + * @param aUrl Url of the page. * @param aClient Http client to use. * @return Client. + * @throws PageException In case of problems retrieving the page. */ Document execute(String aUrl, HttpClient aClient) throws PageException; /** - * Overrides the Xslt to use. + * Overrides the Xslt to use. This is used when the transformed page specifies + * the page type explicitly for an action. * @param aXslt Xslt to use. */ void overrideXslt(String aXslt); diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/PageType.java b/trunk/crawler/basic/src/org/wamblee/crawler/PageType.java index 9d2af30b..c23aa087 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/PageType.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/PageType.java @@ -17,16 +17,28 @@ package org.wamblee.crawler; /** - * + * Represents the type of a page determining how the HTML should be transformed into + * XML. */ public class PageType { + /** + * Type string. + */ private String _type; + /** + * Constructs the type. + * @param aType Type. + */ public PageType(String aType) { _type = aType; } + /** + * Gets the type. + * @return Type. 
+ */ public String getType() { return _type; } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java b/trunk/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java index 22576522..5fbc0e31 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java @@ -30,10 +30,25 @@ import org.w3c.dom.Document; */ public class PostPageRequest extends AbstractPageRequest { + /** + * Constructs the request. + * @param aMaxTries Maximum number of retries. + * @param aMaxDelay Maximum delay before executing the request. + * @param aParams Request parameters to use. + * @param aXslt XSLT to use. + */ public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) { super(aMaxTries, aMaxDelay, aParams, aXslt, null); } + /** + * Constructs the request. + * @param aMaxTries Maximum number of retries. + * @param aMaxDelay Maximum delay before executing the request. + * @param aParams Request parameters to use. + * @param aXslt XSLT to use. + * @param aOs Logging output stream to use. + */ public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) { super(aMaxTries, aMaxDelay, aParams, aXslt, aOs); diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/impl/App.java b/trunk/crawler/basic/src/org/wamblee/crawler/impl/App.java index 15e740a6..65c7f802 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/impl/App.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/impl/App.java @@ -35,15 +35,23 @@ import org.wamblee.crawler.PageException; /** * Entry point for the crawler. */ -public class App { +public final class App { + + /** + * Disabled constructor. 
+ * + */ + private App() { + // Empty + } private static final Log LOG = LogFactory.getLog(App.class); private static final String LOG_FILE = "crawler.log"; - public static void main(String[] args) throws Exception { - String configFileName = args[0]; - String starturl = args[1]; + public static void main(String[] aArgs) throws Exception { + String configFileName = aArgs[0]; + String starturl = aArgs[1]; FileOutputStream fos = new FileOutputStream(new File(LOG_FILE)); PrintStream os = new PrintStream(fos); diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigItem.java b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigItem.java index 7dfd9169..d6c73859 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigItem.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigItem.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
- */ + */ package org.wamblee.crawler.impl; @@ -22,18 +22,19 @@ import java.util.regex.Pattern; * */ class ConfigItem { - + private Pattern _pattern; - private ValueType _value; - - protected ConfigItem(String aPattern, ValueType aValue) { + + private ValueType _value; + + protected ConfigItem(String aPattern, ValueType aValue) { _pattern = Pattern.compile(aPattern); _value = aValue; } - - protected ValueType match(String aValue) { - if ( !_pattern.matcher(aValue).matches() ) { - return null; + + protected ValueType match(String aValue) { + if (!_pattern.matcher(aValue).matches()) { + return null; } return _value; } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationImpl.java b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationImpl.java index 54874559..4a64c6e5 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationImpl.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationImpl.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.impl; @@ -22,44 +22,51 @@ import org.wamblee.crawler.Configuration; import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PageType; - /** - * Implementation of the configuration for the crawler. + * Implementation of the configuration for the crawler. 
*/ public class ConfigurationImpl implements Configuration { - + private List _urlConfig; - private List _pageTypeConfig; - - public ConfigurationImpl(List aUrlConfig, List aPageTypeConfig) { - _urlConfig = aUrlConfig; - _pageTypeConfig = aPageTypeConfig; + + private List _pageTypeConfig; + + public ConfigurationImpl(List aUrlConfig, + List aPageTypeConfig) { + _urlConfig = aUrlConfig; + _pageTypeConfig = aPageTypeConfig; } - /* (non-Javadoc) + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.Configuration#getRequest(java.lang.String) */ public PageRequest getRequest(String aUrl) { - - for (UrlConfig config: _urlConfig) { - PageRequest request = config.getRequest(aUrl); - if ( request != null ) { - return request; + + for (UrlConfig config : _urlConfig) { + PageRequest request = config.getRequest(aUrl); + if (request != null) { + return request; } } - throw new RuntimeException("No configuration matched the URL '" + aUrl + "'"); + throw new RuntimeException("No configuration matched the URL '" + aUrl + + "'"); } - - /* (non-Javadoc) + + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.Configuration#getRequest(org.wamblee.crawler.PageType) */ public PageRequest getRequest(PageType aType) { - for (PageTypeConfig config: _pageTypeConfig) { - PageRequest request = config.getRequest(aType.getType()); - if ( request != null ) { - return request; + for (PageTypeConfig config : _pageTypeConfig) { + PageRequest request = config.getRequest(aType.getType()); + if (request != null) { + return request; } } - throw new RuntimeException("No configuration matched type '" + aType + "'"); + throw new RuntimeException("No configuration matched type '" + aType + + "'"); } } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java index dafbc832..2f2e5f5b 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java +++ 
b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java @@ -36,24 +36,33 @@ import org.wamblee.crawler.PostPageRequest; * Parsing of the configuration from an XML file. */ public class ConfigurationParser { - - private static final String ELEM_URL = "url"; + + private static final String ELEM_URL = "url"; + private static final String ELEM_TYPE = "type"; - private static final String ELEM_PATTERN = "pattern"; - private static final String ELEM_METHOD= "method"; - private static final String ELEM_XSLT = "xslt"; - private static final String ELEM_PARAM = "param"; + + private static final String ELEM_PATTERN = "pattern"; + + private static final String ELEM_METHOD = "method"; + + private static final String ELEM_XSLT = "xslt"; + + private static final String ELEM_PARAM = "param"; + private static final String AT_NAME = "name"; + private static final String AT_VALUE = "value"; - + private static final String METHOD_POST = "post"; + private static final String METHOD_GET = "get"; - - private static final int MAX_TRIES = 3; + + private static final int MAX_TRIES = 3; + private static final int MAX_DELAY = 5000; - - private PrintStream _os; - + + private PrintStream _os; + public ConfigurationParser(PrintStream aOs) { _os = aOs; } @@ -62,10 +71,10 @@ public class ConfigurationParser { try { SAXReader reader = new SAXReader(); Document document = reader.read(aStream); - - Element root = document.getRootElement(); + + Element root = document.getRootElement(); List urlConfigs = parseUrlConfigs(root); - List pageTypeConfigs = parsePageTypeConfigs(root); + List pageTypeConfigs = parsePageTypeConfigs(root); return new ConfigurationImpl(urlConfigs, pageTypeConfigs); } catch (DocumentException e) { throw new RuntimeException("Problem parsing config file", e); @@ -73,36 +82,36 @@ public class ConfigurationParser { } /** - * @param root + * @param aRoot * @return */ - private List parseUrlConfigs(Element root) { + private List parseUrlConfigs(Element aRoot) { 
List configs = new ArrayList(); - for (Iterator i = root.elementIterator(ELEM_URL); i.hasNext(); ) { - Element url = (Element)i.next(); + for (Iterator i = aRoot.elementIterator(ELEM_URL); i.hasNext();) { + Element url = (Element) i.next(); UrlConfig config = parseUrlConfig(url); configs.add(config); } return configs; } - - private List parsePageTypeConfigs(Element root) { + + private List parsePageTypeConfigs(Element aRoot) { List configs = new ArrayList(); - for (Iterator i = root.elementIterator(ELEM_TYPE); i.hasNext(); ) { - Element url = (Element)i.next(); + for (Iterator i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) { + Element url = (Element) i.next(); PageTypeConfig config = parsePageTypeConfig(url); configs.add(config); } return configs; } - - private UrlConfig parseUrlConfig(Element aUrlElem) { + + private UrlConfig parseUrlConfig(Element aUrlElem) { String pattern = aUrlElem.elementText(ELEM_PATTERN); PageRequest request = parseRequestConfig(aUrlElem); return new UrlConfig(pattern, request); } - - private PageTypeConfig parsePageTypeConfig(Element aTypeElem) { + + private PageTypeConfig parsePageTypeConfig(Element aTypeElem) { String pattern = aTypeElem.elementText(ELEM_PATTERN); PageRequest request = parseRequestConfig(aTypeElem); return new PageTypeConfig(pattern, request); @@ -113,30 +122,32 @@ public class ConfigurationParser { * @return */ private PageRequest parseRequestConfig(Element aUrlElem) { - String method = aUrlElem.elementText(ELEM_METHOD); + String method = aUrlElem.elementText(ELEM_METHOD); String xslt = aUrlElem.elementText(ELEM_XSLT); List params = new ArrayList(); - for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext(); ) { - Element paramElem = (Element)i.next(); + for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext();) { + Element paramElem = (Element) i.next(); NameValuePair param = parseParameter(paramElem); params.add(param); } - + NameValuePair[] paramsArray = params.toArray(new 
NameValuePair[0]); - PageRequest request; - if ( METHOD_POST.equals(method)) { - request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os); - } - else if ( METHOD_GET.equals(method) || method == null ){ - request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os); - } else { - throw new RuntimeException("Unknown request method '" + method + "'. Only " + - METHOD_GET + " and " + METHOD_POST + " are supported"); + PageRequest request; + if (METHOD_POST.equals(method)) { + request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, + xslt, _os); + } else if (METHOD_GET.equals(method) || method == null) { + request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, + xslt, _os); + } else { + throw new RuntimeException("Unknown request method '" + method + + "'. Only " + METHOD_GET + " and " + METHOD_POST + + " are supported"); } return request; } - - private NameValuePair parseParameter(Element aParam) { + + private NameValuePair parseParameter(Element aParam) { String name = aParam.attributeValue(AT_NAME); String value = aParam.attributeValue(AT_VALUE); return new NameValuePair(name, value); diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java b/trunk/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java index 90512222..e1e61d8d 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java @@ -43,8 +43,8 @@ public class PageImpl implements Page { private Crawler _crawler; private Element _content; - - private Action[] _actions; + + private Action[] _actions; /** * Constructs a page. 
@@ -56,7 +56,7 @@ public class PageImpl implements Page { _content = aContent; _actions = computeActions(); } - + /* * (non-Javadoc) * @@ -70,11 +70,11 @@ public class PageImpl implements Page { String name = elem.attributeValue(ATT_NAME); String href = elem.attributeValue(ATT_HREF); String type = elem.attributeValue(ATT_TYPE); - if (type == null ) { + if (type == null) { names.add(new ActionImpl(_crawler, elem, name, href)); - } - else { - names.add(new ActionImpl(_crawler, elem, name, href, new PageType(type))); + } else { + names.add(new ActionImpl(_crawler, elem, name, href, + new PageType(type))); } } return names.toArray(new Action[0]); @@ -89,21 +89,24 @@ public class PageImpl implements Page { return _content; } - /* (non-Javadoc) + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.Page#getActions() */ public Action[] getActions() { return _actions; } - + /* - * (non-Javadoc) + * (non-Javadoc) + * * @see org.wamblee.crawler.Page#getAction(java.lang.String) */ public Action getAction(String aName) { List results = new ArrayList(); - for (Action action: _actions) { - if ( action.getName().equals(aName)) { + for (Action action : _actions) { + if (action.getName().equals(aName)) { results.add(action); } } diff --git a/trunk/crawler/basic/src/Main.java b/trunk/crawler/basic/test/org/wamblee/crawler/Main.java similarity index 82% rename from trunk/crawler/basic/src/Main.java rename to trunk/crawler/basic/test/org/wamblee/crawler/Main.java index 8287a6ec..6d157d51 100644 --- a/trunk/crawler/basic/src/Main.java +++ b/trunk/crawler/basic/test/org/wamblee/crawler/Main.java @@ -1,3 +1,4 @@ +package org.wamblee.crawler; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; @@ -33,18 +34,36 @@ import org.w3c.tidy.Tidy; /** * */ -public class Main { +public final class Main { + + /** + * + */ + private static final int PROXY_PORT = 10000; + + /** + * + */ + private static final int MAX_REDIRECTS = 10; + + /** + * Disabled constructor. 
+ * + */ + private Main() { + // Empty + } private static final String BASE = "http://epg.kml.kiss-technology.com/"; - private static int count = 0; + private static int COUNT = 0; public static void main(String[] aArgs) { HttpClientParams clientParams = new HttpClientParams(); - clientParams.setIntParameter(HttpClientParams.MAX_REDIRECTS, 10); + clientParams.setIntParameter(HttpClientParams.MAX_REDIRECTS, MAX_REDIRECTS); clientParams.setBooleanParameter(HttpClientParams.REJECT_RELATIVE_REDIRECT, false); HttpClient client = new HttpClient(clientParams); - client.getHostConfiguration().setProxy("localhost", 10000); + client.getHostConfiguration().setProxy("localhost", PROXY_PORT); clientParams = client.getParams(); Object obj = clientParams.getParameter(HttpClientParams.MAX_REDIRECTS); @@ -81,27 +100,27 @@ public class Main { } /** - * @param client - * @param method + * @param aClient + * @param aMethod */ - private static int executeMethod(HttpClient client, HttpMethod method) { + private static int executeMethod(HttpClient aClient, HttpMethod aMethod) { //method.setFollowRedirects(true); try { // Execute the method. - int statusCode = client.executeMethod(method); + int statusCode = aClient.executeMethod(aMethod); if (statusCode != HttpStatus.SC_OK) { - System.err.println("Method failed: " + method.getStatusLine()); + System.err.println("Method failed: " + aMethod.getStatusLine()); } // Read the response body. 
- String filename = "output" + count++; + String filename = "output" + COUNT++; FileOutputStream os = new FileOutputStream(new File(filename)); //os.write(method.getResponseBody()); Tidy tidy = new Tidy(); tidy.setXHTML(true); - tidy.parse(method.getResponseBodyAsStream(), os); + tidy.parse(aMethod.getResponseBodyAsStream(), os); os.close(); System.out.println("Written response to file: " + filename); return statusCode; @@ -111,7 +130,7 @@ public class Main { throw new RuntimeException("Fatal transport error: " + e.getMessage()); } finally { // Release the connection. - method.releaseConnection(); + aMethod.releaseConnection(); } } diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/AbstractVisitor.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/AbstractVisitor.java index 266878b6..c63487e1 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/AbstractVisitor.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/AbstractVisitor.java @@ -19,16 +19,21 @@ package org.wamblee.crawler.kiss; import java.util.List; /** - * + * Abstract visitor of the tv guide with default looping behavior. */ public abstract class AbstractVisitor implements Visitor { + /** + * Constructs the visitor. + * + */ protected AbstractVisitor() { // Empty } - /* (non-Javadoc) - * @see org.wamblee.crawler.kiss.Visitor#visitChannel(org.wamblee.crawler.kiss.Channel) + /** + * Visits the channel by visiting all programs of the channel. + * @param aChannel Channel to visit. */ public void visitChannel(Channel aChannel) { List programs = aChannel.getPrograms(); @@ -37,8 +42,9 @@ public abstract class AbstractVisitor implements Visitor { } } - /* (non-Javadoc) - * @see org.wamblee.crawler.kiss.Visitor#visitTvGuide(org.wamblee.crawler.kiss.TVGuide) + /** + * Visits the TV guide by visiting all channels of the guide. + * @param aGuide TV guide to visit. 
*/ public void visitTvGuide(TVGuide aGuide) { List channels = aGuide.getChannels(); diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Channel.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Channel.java index 140a3d1c..a59ed08d 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Channel.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Channel.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.kiss; @@ -20,27 +20,51 @@ import java.util.Collections; import java.util.List; /** - * + * Represents the programme for a tv channel. */ public class Channel { - + + /** + * TV channel name. + */ private String _name; - private List _programs; - - public Channel(String aName, List aPrograms) { - _name = aName; - _programs = aPrograms; + + /** + * List of programs in chronological order. + */ + private List _programs; + + /** + * Constructs the channel. + * @param aName Channel name. + * @param aPrograms Programs. + */ + public Channel(String aName, List aPrograms) { + _name = aName; + _programs = aPrograms; } - - public String getName() { - return _name; + + /** + * Gets the channel name. + * @return channel name. + */ + public String getName() { + return _name; } - - public List getPrograms() { + + /** + * Gets the list of program. + * @return Programs. + */ + public List getPrograms() { return Collections.unmodifiableList(_programs); } - - public void accept(Visitor aVisitor) { + + /** + * Accepts a visitor. + * @param aVisitor Visitor. 
+ */ + public void accept(Visitor aVisitor) { aVisitor.visitChannel(this); } } diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java index 298e9443..5ada1009 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java @@ -34,7 +34,6 @@ import javax.mail.Message; import javax.mail.MessagingException; import javax.mail.Session; import javax.mail.Transport; -import javax.mail.internet.AddressException; import javax.mail.internet.InternetAddress; import javax.mail.internet.MimeMessage; @@ -52,27 +51,65 @@ import org.wamblee.crawler.impl.ConfigurationParser; import org.wamblee.crawler.impl.CrawlerImpl; /** + * The KiSS crawler for automatic recording of interesting TV shows. * */ public class KissCrawler { private static final Log LOG = LogFactory.getLog(KissCrawler.class); + /** + * Log file name for the crawler. + */ private static final String LOG_FILE = "kiss.log"; + /** + * Start URL of the electronic programme guide. + */ private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php"; + /** + * Crawler configuration file. + */ private static final String CRAWLER_CONFIG = "config.xml"; + /** + * Configuration file describing interesting programs. + */ private static final String PROGRAM_CONFIG = "programs.xml"; + /** + * Regular expression for matching time interval strings in the + * retrieved pages. + */ private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*"; + /** + * Compiled pattern for the time regular expression. + */ private Pattern _pattern; + /** + * Runs the KiSS crawler. + * @param aArgs Arguments, currently all ignored because they are hardcoded. + * @throws Exception In case of problems. 
+ */ + public static void main(String[] aArgs) throws Exception { + new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); + } + + /** + * Constructs the crawler. This retrieves the TV guide by crawling the + * KiSS EPG guide, filters the guide for interesting programs, tries to + * record them, and sends a summary mail to the user. + * @param aStartUrl Start URL of the electronic programme guide. + * @param aCrawlerConfig Configuration file for the crawler. + * @param aProgramConfig Configuration file describing interesting shows. + * @throws IOException In case of problems reading files. + * @throws MessagingException In case of problems sending a mail notification. + */ public KissCrawler(String aStartUrl, String aCrawlerConfig, - String aProgramConfig) throws IOException, AddressException, - MessagingException { + String aProgramConfig) throws IOException, MessagingException { _pattern = Pattern.compile(TIME_REGEX); @@ -103,15 +140,15 @@ public class KissCrawler { } /** - * @param programCondition - * @param guide - * @throws AddressException - * @throws MessagingException + * Records interesting shows. + * @param aProgramCondition Condition determining which shows are interesting. + * @param aGuide Television guide. + * @throws MessagingException In case of problems sending a summary mail. 
*/ - private void recordInterestingShows(Condition programCondition, - TVGuide guide) throws AddressException, MessagingException { - MatchVisitor matcher = new MatchVisitor(programCondition); - guide.accept(matcher); + private void recordInterestingShows(Condition aProgramCondition, + TVGuide aGuide) throws MessagingException { + MatchVisitor matcher = new MatchVisitor(aProgramCondition); + aGuide.accept(matcher); List programs = matcher.getMatches(); String recorded = ""; String notRecorded = ""; @@ -148,30 +185,33 @@ public class KissCrawler { } /** - * @param aCrawlerConfig - * @param os - * @param client - * @return - * @throws FileNotFoundException + * Creates the crawler. + * @param aCrawlerConfig Crawler configuration file. + * @param aOs Logging output stream for the crawler. + * @param aClient HTTP Client to use. + * @return Crawler. + * @throws FileNotFoundException In case configuration files cannot be found. */ - private Crawler createCrawler(String aCrawlerConfig, PrintStream os, - HttpClient client) throws FileNotFoundException { - ConfigurationParser parser = new ConfigurationParser(os); + private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs, + HttpClient aClient) throws FileNotFoundException { + ConfigurationParser parser = new ConfigurationParser(aOs); InputStream crawlerConfigFile = new FileInputStream(new File( aCrawlerConfig)); Configuration config = parser.parse(crawlerConfigFile); - Crawler crawler = new CrawlerImpl(client, config); + Crawler crawler = new CrawlerImpl(aClient, config); return crawler; } /** - * @param aStartUrl - * @param crawler - * @return + * Gets the start page of the electronic programme guide. This involves login and + * navigation to a suitable start page after logging in. + * @param aStartUrl URL of the electronic programme guide. + * @param aCrawler Crawler to use. + * @return Starting page. 
*/ - private Page getStartPage(String aStartUrl, Crawler crawler) { + private Page getStartPage(String aStartUrl, Crawler aCrawler) { try { - Page page = crawler.getPage(aStartUrl); + Page page = aCrawler.getPage(aStartUrl); return page.getAction("channels-favorites").execute(); } catch (PageException e) { throw new RuntimeException( @@ -179,22 +219,14 @@ public class KissCrawler { } } - public static void main(String[] args) throws Exception { - new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG); - } - - private void showPage(Page aPage) { - Action[] links = aPage.getActions(); - for (Action link : links) { - System.out.println("Link found '" + link.getName() + "'"); - } - Element element = aPage.getContent(); - System.out.println("Retrieved content: " + element.asXML()); - } - - private TVGuide createGuide(Page page) { + /** + * Creates the TV guide by web crawling. + * @param aPage Starting page. + * @return TV guide. + */ + private TVGuide createGuide(Page aPage) { LOG.info("Obtaining full TV guide"); - Action[] actions = page.getActions(); + Action[] actions = aPage.getActions(); List channels = new ArrayList(); for (Action action : actions) { try { @@ -210,6 +242,12 @@ public class KissCrawler { return new TVGuide(channels); } + /** + * Create channel information for a specific channel. + * @param aChannel Channel name. + * @param aPage Starting page for the channel. + * @return Channel. + */ private Channel createChannel(String aChannel, Page aPage) { LOG.info("Obtaining program for " + aChannel); Action[] programActions = aPage.getActions(); @@ -240,8 +278,12 @@ public class KissCrawler { return new Channel(aChannel, programs); } - private void sendMail(String aText) throws AddressException, - MessagingException { + /** + * Sends a summary mail to the user. + * @param aText Text of the mail. + * @throws MessagingException In case of problems sending mail. 
+ */ + private void sendMail(String aText) throws MessagingException { Properties props = new Properties(); props.put("mail.transport.protocol", "smtp"); props.put("mail.smtp.host", "falcon"); diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/MatchVisitor.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/MatchVisitor.java index a0574568..92e9b89a 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/MatchVisitor.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/MatchVisitor.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.kiss; @@ -22,30 +22,47 @@ import java.util.List; import org.wamblee.conditions.Condition; /** - * + * Visitor which determines the interesting programs in the TV guide. */ public class MatchVisitor extends AbstractVisitor { - - private Condition _matcher; + + /** + * Criterion that determines which programs are interesting. + */ + private Condition _matcher; + + /** + * List of interesting programs. + */ private List _programs; - - public MatchVisitor(Condition aMatcher) { - _matcher = aMatcher; + + /** + * Constructs the visitor. + * @param aMatcher Condition describing interesting programs. + */ + public MatchVisitor(Condition aMatcher) { + _matcher = aMatcher; _programs = new ArrayList(); } - - /* (non-Javadoc) + + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.kiss.Visitor#visitProgram(org.wamblee.crawler.kiss.Program) */ public void visitProgram(Program aProgram) { - if ( _matcher.matches(aProgram)) { + if (_matcher.matches(aProgram)) { _programs.add(aProgram); } } - - public List getMatches() { - return _programs; + + /** + * Gets the list of interesting programs. To be called after applying + * the visitor on a tv guide. + * @return List of interesting programs. 
+ */ + public List getMatches() { + return _programs; } - } diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/PrintVisitor.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/PrintVisitor.java index 387782b3..0764426b 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/PrintVisitor.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/PrintVisitor.java @@ -19,12 +19,19 @@ package org.wamblee.crawler.kiss; import java.io.PrintStream; /** - * + * Print visitor for pretty printing the TV guide. */ public class PrintVisitor extends AbstractVisitor { + /** + * Stream to print the guide on. + */ private PrintStream _stream; + /** + * Constructs the print visitor. + * @param aStream Stream to print on. + */ public PrintVisitor(PrintStream aStream) { _stream = aStream; } diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java index 2308fa98..12f51db9 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Program.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.kiss; @@ -20,68 +20,141 @@ import org.wamblee.crawler.Action; import org.wamblee.crawler.PageException; /** - * + * Represents a television program. */ public class Program { - + + /** + * Name of the record action on the program details page. + */ private static final String RECORD_ACTION = "record"; + + /** + * Indent string to use for pretty printing. + */ private static final String INDENT = " "; - - private String _channel; + + /** + * Channel the program is on. + */ + private String _channel; + + /** + * Program name. + */ private String _name; - private String _description; + + /** + * Program description. 
+ */ + private String _description; + + /** + * Keywords or classification of the program. + */ private String _keywords; + + /** + * Time interval for the program (from/to). + */ private TimeInterval _interval; - private Action _programInfo; - - public Program(String aChannel, String aName, String aDescription, String aKeywords, TimeInterval aInterval, Action aProgramInfo) { - _channel = aChannel; - _name = aName; + + /** + * Action to execute to obtain program information and/or record the program. + */ + private Action _programInfo; + + /** + * Constructs the program. + * @param aChannel Channel name. + * @param aName Program name. + * @param aDescription Description. + * @param aKeywords Keywords/classification. + * @param aInterval Time interval. + * @param aProgramInfo Action to execute for detailed program information or + * for recording the page. + */ + public Program(String aChannel, String aName, String aDescription, + String aKeywords, TimeInterval aInterval, Action aProgramInfo) { + _channel = aChannel; + _name = aName; _description = aDescription; - _keywords = aKeywords; + _keywords = aKeywords; _interval = aInterval; - _programInfo = aProgramInfo; + _programInfo = aProgramInfo; } - - public String getChannel() { - return _channel; + + /** + * Gets the channel. + * @return Channel. + */ + public String getChannel() { + return _channel; } - - public String getName() { - return _name; + + /** + * Gets the program name. + * @return Name. + */ + public String getName() { + return _name; } - - public String getDescription() { + + /** + * Gets the description. + * @return Description. + */ + public String getDescription() { return _description; } - - public String getKeywords() { - return _keywords; + + /** + * Gets the keywords/classification. + * @return Keywords/classification + */ + public String getKeywords() { + return _keywords; } - - public TimeInterval getInterval() { - return _interval; + + /** + * Gets the time interval. 
+ * @return Time interval. + */ + public TimeInterval getInterval() { + return _interval; } - - public boolean record() throws PageException { - Action record = _programInfo.execute().getAction(RECORD_ACTION); - if ( record == null) { + + /** + * Records the show. + * @return True iff an attempt could be made to record the page. + * @throws PageException In case of problems recording the page. + */ + public boolean record() throws PageException { + Action record = _programInfo.execute().getAction(RECORD_ACTION); + if (record == null) { return false; } - record.execute(); - return true; + record.execute(); + return true; } - - public void accept(Visitor aVisitor) { + + /** + * Accepts the visitor. + * @param aVisitor Visitor. + */ + public void accept(Visitor aVisitor) { aVisitor.visitProgram(this); } - - /* (non-Javadoc) + + /* + * (non-Javadoc) + * * @see java.lang.Object#toString() */ @Override public String toString() { - return _interval + " - " + _name + " (" + _channel + "/" + _keywords + ")" + "\n" + - (INDENT + _description).replaceAll("\n", "\n" + INDENT); + return _interval + " - " + _name + " (" + _channel + "/" + _keywords + + ")" + "\n" + + (INDENT + _description).replaceAll("\n", "\n" + INDENT); } } diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java index 71719799..5615ced2 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramConfigurationParser.java @@ -32,9 +32,9 @@ import org.wamblee.conditions.OrCondition; * Parse the configuration of desired programs. 
*/ public class ProgramConfigurationParser { - - + private static final String ELEM_PROGRAM = "program"; + private static final String ELEM_PATTERN = "name"; /** @@ -52,10 +52,11 @@ public class ProgramConfigurationParser { Element root = document.getRootElement(); List> conditions = new ArrayList>(); - for (Iterator i = root.elementIterator(ELEM_PROGRAM); i.hasNext(); ) { - Element program = (Element)i.next(); - String pattern = ".*" + program.element(ELEM_PATTERN).getText() + ".*"; - conditions.add(new ProgramNameMatcher(pattern)); + for (Iterator i = root.elementIterator(ELEM_PROGRAM); i.hasNext();) { + Element program = (Element) i.next(); + String pattern = ".*" + program.element(ELEM_PATTERN).getText() + + ".*"; + conditions.add(new ProgramNameMatcher(pattern)); } return new OrCondition(conditions); } catch (DocumentException e) { diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramNameMatcher.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramNameMatcher.java index cd36079c..867d5b38 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramNameMatcher.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/ProgramNameMatcher.java @@ -26,14 +26,23 @@ import org.wamblee.conditions.Condition; */ public class ProgramNameMatcher implements Condition { + /** + * Pattern which describes interesting programs. + */ private Pattern _pattern; + /** + * Constructs the matcher. + * @param aPattern Pattern that describes interesting programs. + */ public ProgramNameMatcher(String aPattern) { _pattern = Pattern.compile(aPattern); } - /* (non-Javadoc) - * @see org.wamblee.crawler.kiss.ProgramMatcher#matches(org.wamblee.crawler.kiss.Program) + /** + * Determines if the program name matches. + * @param aProgram Program. + * @return True iff the program name matches. 
*/ public boolean matches(Program aProgram) { Matcher matcher = _pattern.matcher(aProgram.getName().toLowerCase()); diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/TVGuide.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/TVGuide.java index fb73f750..319380a4 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/TVGuide.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/TVGuide.java @@ -20,20 +20,35 @@ import java.util.Collections; import java.util.List; /** - * + * The TV guide. */ public class TVGuide { + /** + * List of channels. + */ private List _channels; + /** + * Constructs the guide. + * @param aChannels Channels of the guide. + */ public TVGuide(List aChannels) { _channels = aChannels; } + /** + * Gets the channels. + * @return Channels. + */ public List getChannels() { return Collections.unmodifiableList(_channels); } + /** + * Accepts the visitor. + * @param aVisitor Visitor. + */ public void accept(Visitor aVisitor) { aVisitor.visitTvGuide(this); } diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Time.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Time.java index b2f95f05..6679223b 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Time.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Time.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.kiss; @@ -20,36 +20,67 @@ import java.text.DecimalFormat; import java.text.NumberFormat; /** - * + * Time at which a program starts or ends. */ public class Time { - - private int _hour; - private int _minute; - - public Time(int aHour, int aMinute) { + + /** + * Number of seconds per minute. + */ + private static final double SECONDS_PER_MINUTE = 60.0; + + /** + * Hour of the time. + */ + private int _hour; + + /** + * Minute of the hour.
+ */ + private int _minute; + + /** + * Constructs the time. + * @param aHour Hour. + * @param aMinute Minute. + */ + public Time(int aHour, int aMinute) { _hour = aHour; _minute = aMinute; } - - public int getHour() { - return _hour; + + /** + * Gets the hour. + * @return Hour. + */ + public int getHour() { + return _hour; } - - public int getMinute() { - return _minute; + + /** + * Gets the minute. + * @return Minute. + */ + public int getMinute() { + return _minute; } - /* (non-Javadoc) + /* + * (non-Javadoc) + * * @see java.lang.Object#toString() */ @Override public String toString() { NumberFormat format = new DecimalFormat("00"); - return format.format(_hour) + ":" + format.format(_minute); + return format.format(_hour) + ":" + format.format(_minute); } - + + /** + * Convert time to floating point value. Useful for comparing two times. + * @return Converted value. + */ float asFloat() { - return (float)_hour + (float)_minute/(float)60.0; + return (float) _hour + (float) _minute / (float) SECONDS_PER_MINUTE; } } diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/TimeInterval.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/TimeInterval.java index 1e0e5151..0dbd570b 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/TimeInterval.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/TimeInterval.java @@ -12,66 +12,93 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.kiss; /** - * + * Time interval. */ public class TimeInterval { - - private Time _begin; - private Time _end; - - public TimeInterval(Time aBegin, Time aEnd) { - _begin = aBegin; - _end = aEnd; + + /** + * Begin time. + */ + private Time _begin; + + /** + * End time. + */ + private Time _end; + + /** + * Constructs the interval. + * @param aBegin Start time. + * @param aEnd End time.
+ */ + public TimeInterval(Time aBegin, Time aEnd) { + _begin = aBegin; + _end = aEnd; } - - public Time getBegin() { - return _begin; + + /** + * Gets the begin time. + * @return Begin time. + */ + public Time getBegin() { + return _begin; } - - public Time getEnd() { - return _end; + + /** + * Gets the end time. + * @return End time. + */ + public Time getEnd() { + return _end; } - /* (non-Javadoc) + /* + * (non-Javadoc) + * * @see java.lang.Object#toString() */ @Override public String toString() { - return _begin + " - " + _end; + return _begin + " - " + _end; } - + /** - * Determines if there is an overlap between the current interval and given one. + * Determines if there is an overlap between the current interval and given + * one. * - * @param aInterval Interval to compare with. + * @param aInterval + * Interval to compare with. * @return True iff there is overlap */ - public boolean overlap(TimeInterval aInterval) { - - if ( isUncertain() || aInterval.isUncertain()) { - // Optimistic assume there is no overlap if one of the intervals is uncertain. - return false; + public boolean overlap(TimeInterval aInterval) { + + if (isUncertain() || aInterval.isUncertain()) { + // Optimistic assume there is no overlap if one of the intervals is + // uncertain. + return false; } - - if ( _end.asFloat() <= aInterval._begin.asFloat() || - aInterval._end.asFloat() <= _begin.asFloat() ) { - return false; + + if (_end.asFloat() <= aInterval._begin.asFloat() + || aInterval._end.asFloat() <= _begin.asFloat()) { + return false; } - + return true; } - + /** - * Determines if the actual time that the program corresponds to is uncertain due to - * the representation of a period of more than 24 hours using a 24 hour clock. - * @return True iff the interval is uncertain. + * Determines if the actual time that the program corresponds to is + * uncertain due to the representation of a period of more than 24 hours + * using a 24 hour clock. 
+ * + * @return True iff the interval is uncertain. */ - boolean isUncertain() { - return _begin.asFloat() > _end.asFloat(); + boolean isUncertain() { + return _begin.asFloat() > _end.asFloat(); } } diff --git a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Visitor.java b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Visitor.java index df9be674..d20a9125 100644 --- a/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Visitor.java +++ b/trunk/crawler/kiss/src/org/wamblee/crawler/kiss/Visitor.java @@ -17,13 +17,25 @@ package org.wamblee.crawler.kiss; /** - * + * Visitor of the TV guide. */ public interface Visitor { + /** + * Visits a program. + * @param aProgram Program. + */ void visitProgram(Program aProgram); + /** + * Visits a channel. + * @param aChannel Channel. + */ void visitChannel(Channel aChannel); + /** + * Visits the guide. + * @param aGuide Guide. + */ void visitTvGuide(TVGuide aGuide); }