<!-- Include the build header defining general properties -->
<!-- =============================================================================== -->
<property name="project.home" value="../.."/>
- <property name="module.name" value="crawler" />
+ <property name="module.name" value="crawler-basic" />
&header;
public abstract class AbstractPageRequest implements PageRequest {
private static final Log LOG = LogFactory.getLog(AbstractPageRequest.class);
+
private static final String REDIRECT_HEADER = "Location";
-
- private int _maxTries;
- private int _maxDelay;
+
+ private int _maxTries;
+
+ private int _maxDelay;
private NameValuePair[] _params;
private String _xslt;
-
- private PrintStream _os;
- protected AbstractPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) {
- if ( aParams == null ) {
+ private PrintStream _os;
+
+ /**
+ * Constructs the request.
+ * @param aMaxTries Maximum retries to perform.
+ * @param aMaxDelay Maximum delay before executing a request.
+ * @param aParams Request parameters to use.
+ * @param aXslt XSLT used to convert the response.
+ * @param aOs Output stream for logging (if null then no logging is done).
+ */
+ protected AbstractPageRequest(int aMaxTries, int aMaxDelay,
+ NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+ if (aParams == null) {
throw new IllegalArgumentException("aParams is null");
}
- if ( aXslt == null ) {
+ if (aXslt == null) {
throw new IllegalArgumentException("aXslt is null");
}
_maxTries = aMaxTries;
_maxDelay = aMaxDelay;
_params = aParams;
_xslt = aXslt;
- _os = aOs;
+ _os = aOs;
}
-
- /* (non-Javadoc)
+
+ /*
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.PageRequest#overrideXslt(java.lang.String)
*/
public void overrideXslt(String aXslt) {
- _xslt = aXslt;
+ _xslt = aXslt;
}
-
- protected NameValuePair[] getParameters() {
+
+ /**
+ * Gets the parameters for the request.
+ * @return Request parameters.
+ */
+ protected NameValuePair[] getParameters() {
return _params;
}
-
- protected Document executeMethod(HttpClient client, HttpMethod method) throws TransformerException {
- int triesLeft = _maxTries;
- while ( triesLeft > 0 ) {
+
+ /**
+ * Executes the request with a random delay and with a maximum number of
+ * retries.
+ * @param aClient HTTP client to use.
+ * @param aMethod Method representing the request.
+ * @return XML document describing the response.
+ * @throws TransformerException In case transformation of the HTML to XML fails.
+ */
+ protected Document executeMethod(HttpClient aClient, HttpMethod aMethod)
+ throws TransformerException {
+ int triesLeft = _maxTries;
+ while (triesLeft > 0) {
triesLeft--;
- try {
- return executeMethodWithoutRetries(client, method);
- } catch (TransformerException e) {
- if ( triesLeft == 0 ) {
+ try {
+ return executeMethodWithoutRetries(aClient, aMethod);
+ } catch (TransformerException e) {
+ if (triesLeft == 0) {
throw e;
}
}
}
throw new RuntimeException("Code should never reach this point");
}
-
- protected Document executeMethodWithoutRetries(HttpClient client, HttpMethod method) throws TransformerException {
+ /**
+ * Executes the request without doing any retries in case XSLT transformation
+ * fails.
+ * @param aClient HTTP client to use.
+ * @param aMethod Method to execute.
+ * @return XML document containing the result.
+ * @throws TransformerException In case transformation of the result to XML fails.
+ */
+ protected Document executeMethodWithoutRetries(HttpClient aClient,
+ HttpMethod aMethod) throws TransformerException {
try {
// Execute the method.
- method = executeWithRedirects(client, method);
+ aMethod = executeWithRedirects(aClient, aMethod);
// Transform the HTML into wellformed XML.
Tidy tidy = new Tidy();
tidy.setXHTML(true);
- tidy.setQuiet(true);
+ tidy.setQuiet(true);
tidy.setShowWarnings(false);
- if ( _os != null ) {
- _os.println("Content of '" + method.getURI() + "'");
+ if (_os != null) {
+ _os.println("Content of '" + aMethod.getURI() + "'");
_os.println();
}
- // We let jtidy produce raw output because the DOM it produces is
- // is not namespace aware. We let the XSLT processor parse the XML again
- // to ensure that the XSLT uses a namespace aware DOM tree. An alternative
- // is to configure namespace awareness of the XML parser in a system wide way.
+ // We let jtidy produce raw output because the DOM it produces is not
+ // namespace aware. We let the XSLT processor parse the XML again to
+ // ensure that the XSLT uses a namespace aware DOM tree. An alternative
+ // is to configure namespace awareness of the XML parser in a
+ // system-wide way.
ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
- tidy.parse(method.getResponseBodyAsStream(), xhtml);
+ tidy.parse(aMethod.getResponseBodyAsStream(), xhtml);
_os.print(new String(xhtml.toByteArray()));
- // Obtaining the XML as dom is not used.
- //Document w3cDoc = tidy.parseDOM(method.getResponseBodyAsStream(),
- // _os);
- if ( _os != null ) {
+ // Obtaining the XML as dom is not used.
+ // Document w3cDoc = tidy.parseDOM(method.getResponseBodyAsStream(),
+ // _os);
+ if (_os != null) {
_os.println();
}
xhtml.flush();
byte[] xhtmlData = xhtml.toByteArray();
- Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt)));
+ Document transformed = new XSLT().transform(xhtmlData,
+ new FileResource(new File(_xslt)));
_os.println("Transformed result is: ");
- Transformer transformer = TransformerFactory.newInstance().newTransformer();
+ Transformer transformer = TransformerFactory.newInstance()
+ .newTransformer();
transformer.setParameter(OutputKeys.INDENT, "yes");
transformer.setParameter(OutputKeys.METHOD, "xml");
- transformer.transform(new DOMSource(transformed), new StreamResult(_os));
-
+ transformer.transform(new DOMSource(transformed), new StreamResult(
+ _os));
+
return transformed;
- } catch (HttpException e) {
- throw new RuntimeException(e.getMessage(), e);
- } catch (IOException e) {
+ } catch (HttpException e) {
+ throw new RuntimeException(e.getMessage(), e);
+ } catch (IOException e) {
throw new RuntimeException(e.getMessage(), e);
- } catch (TransformerConfigurationException e) {
+ } catch (TransformerConfigurationException e) {
throw new RuntimeException(e.getMessage(), e);
} finally {
// Release the connection.
- method.releaseConnection();
+ aMethod.releaseConnection();
}
}
-
- private void delay() {
+
+ /**
+ * Sleeps for a random time but no more than the maximum delay.
+ *
+ */
+ private void delay() {
try {
- Thread.sleep((long)((float)_maxDelay* Math.random()));
- } catch (InterruptedException e) {
- //
+ Thread.sleep((long) ((float) _maxDelay * Math.random()));
+ } catch (InterruptedException e) {
+ return; // to satisfy checkstyle
}
}
-
/**
- * @param aClient
- * @param aMethod
- * @throws IOException
- * @throws HttpException
+ * Executes the request and follows redirects if needed.
+ * @param aClient HTTP client to use.
+ * @param aMethod Method to use.
+ * @return Final HTTP method used (differs from the parameter passed in in case
+ * of redirection).
+ * @throws IOException In case of network problems.
*/
- private HttpMethod executeWithRedirects(HttpClient aClient, HttpMethod aMethod) throws IOException, HttpException {
+ private HttpMethod executeWithRedirects(HttpClient aClient,
+ HttpMethod aMethod) throws IOException {
delay();
int statusCode = aClient.executeMethod(aMethod);
- switch (statusCode) {
+ switch (statusCode) {
case HttpStatus.SC_OK: {
return aMethod;
}
aMethod.releaseConnection();
Header header = aMethod.getResponseHeader(REDIRECT_HEADER);
aMethod = new GetMethod(header.getValue());
- return executeWithRedirects(aClient, aMethod); // TODO protect against infinite recursion.
+ return executeWithRedirects(aClient, aMethod); // TODO protect
+ // against infinite
+ // recursion.
}
default: {
throw new RuntimeException("Method failed: "
/**
* Executes the action.
- * @return
+ * @return New page as a result of the action.
+ * @throws PageException In case of an error obtaining the page.
*/
Page execute() throws PageException;
/**
* Gets a description of the action. The element returned is the action element
* itself.
- * @return
+ * @return Content as XML.
*/
Element getContent();
}
/**
* Gets the page request based on the URL.
- * @param aUrl
+ * @param aUrl Url of the page to retrieve.
* @return Page request.
*/
PageRequest getRequest(String aUrl);
* Gets the content for a specific page.
* @param aUrl Url of page.
* @return Page to retrieve.
+ * @throws PageException In case of problems retrieving the page.
*/
Page getPage(String aUrl) throws PageException;
* Gets the content for a specific page.
* @param aUrl Url of page.
* @param aType Type of page.
- * @return Page.
+ * @return Page.
+ * @throws PageException In case of problems retrieving the page.
*/
Page getPage(String aUrl, PageType aType) throws PageException;
}
*/
public class GetPageRequest extends AbstractPageRequest {
+ /**
+ * Constructs the request.
+ * @param aMaxTries Maximum number of retries.
+ * @param aMaxDelay Maximum delay before executing the request.
+ * @param aParams Request parameters to use.
+ * @param aXslt XSLT to use.
+ */
public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) {
super(aMaxTries, aMaxDelay, aParams, aXslt, null);
}
+ /**
+ * Constructs the request.
+ * @param aMaxTries Maximum number of retries.
+ * @param aMaxDelay Maximum delay before executing the request.
+ * @param aParams Request parameters to use.
+ * @param aXslt XSLT to use.
+ * @param aOs Logging output stream to use.
+ */
public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) {
super(aMaxTries, aMaxDelay, aParams, aXslt, aOs);
}
package org.wamblee.crawler;
/**
- *
+ * Exception thrown when there is a problem in retrieving or transforming the
+ * page.
*/
public class PageException extends Exception {
+ /**
+ * Constructs the exception.
+ * @param aMsg Message.
+ */
public PageException(String aMsg) {
super(aMsg);
}
+ /**
+ * Constructs the exception.
+ * @param aMsg Message.
+ * @param aCause Cause of the exception.
+ */
public PageException(String aMsg, Throwable aCause) {
super(aMsg, aCause);
}
public interface PageRequest {
/**
- * Gets a page as an XML document.
+ * Gets a page as an XML document.
+ * @param aUrl Url of the page.
* @param aClient Http client to use.
* @return Page as an XML document.
+ * @throws PageException In case of problems retrieving the page.
*/
Document execute(String aUrl, HttpClient aClient) throws PageException;
/**
- * Overrides the Xslt to use.
+ * Overrides the Xslt to use. This is used when the transformed page specifies
+ * the page type explicitly for an action.
* @param aXslt Xslt to use.
*/
void overrideXslt(String aXslt);
package org.wamblee.crawler;
/**
- *
+ * Represents the type of a page determining how the HTML should be transformed into
+ * XML.
*/
public class PageType {
+ /**
+ * Type string.
+ */
private String _type;
+ /**
+ * Constructs the type.
+ * @param aType Type.
+ */
public PageType(String aType) {
_type = aType;
}
+ /**
+ * Gets the type.
+ * @return Type.
+ */
public String getType() {
return _type;
}
*/
public class PostPageRequest extends AbstractPageRequest {
+ /**
+ * Constructs the request.
+ * @param aMaxTries Maximum number of retries.
+ * @param aMaxDelay Maximum delay before executing the request.
+ * @param aParams Request parameters to use.
+ * @param aXslt XSLT to use.
+ */
public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) {
super(aMaxTries, aMaxDelay, aParams, aXslt, null);
}
+ /**
+ * Constructs the request.
+ * @param aMaxTries Maximum number of retries.
+ * @param aMaxDelay Maximum delay before executing the request.
+ * @param aParams Request parameters to use.
+ * @param aXslt XSLT to use.
+ * @param aOs Logging output stream to use.
+ */
public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt,
PrintStream aOs) {
super(aMaxTries, aMaxDelay, aParams, aXslt, aOs);
/**
* Entry point for the crawler.
*/
-public class App {
+public final class App {
+
+ /**
+ * Disabled constructor.
+ *
+ */
+ private App() {
+ // Empty
+ }
private static final Log LOG = LogFactory.getLog(App.class);
private static final String LOG_FILE = "crawler.log";
- public static void main(String[] args) throws Exception {
- String configFileName = args[0];
- String starturl = args[1];
+ public static void main(String[] aArgs) throws Exception {
+ String configFileName = aArgs[0];
+ String starturl = aArgs[1];
FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
PrintStream os = new PrintStream(fos);
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.impl;
*
*/
class ConfigItem<ValueType> {
-
+
private Pattern _pattern;
- private ValueType _value;
-
- protected ConfigItem(String aPattern, ValueType aValue) {
+
+ private ValueType _value;
+
+ protected ConfigItem(String aPattern, ValueType aValue) {
_pattern = Pattern.compile(aPattern);
_value = aValue;
}
-
- protected ValueType match(String aValue) {
- if ( !_pattern.matcher(aValue).matches() ) {
- return null;
+
+ protected ValueType match(String aValue) {
+ if (!_pattern.matcher(aValue).matches()) {
+ return null;
}
return _value;
}
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.impl;
import org.wamblee.crawler.PageRequest;
import org.wamblee.crawler.PageType;
-
/**
- * Implementation of the configuration for the crawler.
+ * Implementation of the configuration for the crawler.
*/
public class ConfigurationImpl implements Configuration {
-
+
private List<UrlConfig> _urlConfig;
- private List<PageTypeConfig> _pageTypeConfig;
-
- public ConfigurationImpl(List<UrlConfig> aUrlConfig, List<PageTypeConfig> aPageTypeConfig) {
- _urlConfig = aUrlConfig;
- _pageTypeConfig = aPageTypeConfig;
+
+ private List<PageTypeConfig> _pageTypeConfig;
+
+ public ConfigurationImpl(List<UrlConfig> aUrlConfig,
+ List<PageTypeConfig> aPageTypeConfig) {
+ _urlConfig = aUrlConfig;
+ _pageTypeConfig = aPageTypeConfig;
}
- /* (non-Javadoc)
+ /*
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Configuration#getRequest(java.lang.String)
*/
public PageRequest getRequest(String aUrl) {
-
- for (UrlConfig config: _urlConfig) {
- PageRequest request = config.getRequest(aUrl);
- if ( request != null ) {
- return request;
+
+ for (UrlConfig config : _urlConfig) {
+ PageRequest request = config.getRequest(aUrl);
+ if (request != null) {
+ return request;
}
}
- throw new RuntimeException("No configuration matched the URL '" + aUrl + "'");
+ throw new RuntimeException("No configuration matched the URL '" + aUrl
+ + "'");
}
-
- /* (non-Javadoc)
+
+ /*
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Configuration#getRequest(org.wamblee.crawler.PageType)
*/
public PageRequest getRequest(PageType aType) {
- for (PageTypeConfig config: _pageTypeConfig) {
- PageRequest request = config.getRequest(aType.getType());
- if ( request != null ) {
- return request;
+ for (PageTypeConfig config : _pageTypeConfig) {
+ PageRequest request = config.getRequest(aType.getType());
+ if (request != null) {
+ return request;
}
}
- throw new RuntimeException("No configuration matched type '" + aType + "'");
+ throw new RuntimeException("No configuration matched type '" + aType
+ + "'");
}
}
* Parsing of the configuration from an XML file.
*/
public class ConfigurationParser {
-
- private static final String ELEM_URL = "url";
+
+ private static final String ELEM_URL = "url";
+
private static final String ELEM_TYPE = "type";
- private static final String ELEM_PATTERN = "pattern";
- private static final String ELEM_METHOD= "method";
- private static final String ELEM_XSLT = "xslt";
- private static final String ELEM_PARAM = "param";
+
+ private static final String ELEM_PATTERN = "pattern";
+
+ private static final String ELEM_METHOD = "method";
+
+ private static final String ELEM_XSLT = "xslt";
+
+ private static final String ELEM_PARAM = "param";
+
private static final String AT_NAME = "name";
+
private static final String AT_VALUE = "value";
-
+
private static final String METHOD_POST = "post";
+
private static final String METHOD_GET = "get";
-
- private static final int MAX_TRIES = 3;
+
+ private static final int MAX_TRIES = 3;
+
private static final int MAX_DELAY = 5000;
-
- private PrintStream _os;
-
+
+ private PrintStream _os;
+
public ConfigurationParser(PrintStream aOs) {
_os = aOs;
}
try {
SAXReader reader = new SAXReader();
Document document = reader.read(aStream);
-
- Element root = document.getRootElement();
+
+ Element root = document.getRootElement();
List<UrlConfig> urlConfigs = parseUrlConfigs(root);
- List<PageTypeConfig> pageTypeConfigs = parsePageTypeConfigs(root);
+ List<PageTypeConfig> pageTypeConfigs = parsePageTypeConfigs(root);
return new ConfigurationImpl(urlConfigs, pageTypeConfigs);
} catch (DocumentException e) {
throw new RuntimeException("Problem parsing config file", e);
}
/**
- * @param root
+ * @param aRoot
* @return
*/
- private List<UrlConfig> parseUrlConfigs(Element root) {
+ private List<UrlConfig> parseUrlConfigs(Element aRoot) {
List<UrlConfig> configs = new ArrayList<UrlConfig>();
- for (Iterator i = root.elementIterator(ELEM_URL); i.hasNext(); ) {
- Element url = (Element)i.next();
+ for (Iterator i = aRoot.elementIterator(ELEM_URL); i.hasNext();) {
+ Element url = (Element) i.next();
UrlConfig config = parseUrlConfig(url);
configs.add(config);
}
return configs;
}
-
- private List<PageTypeConfig> parsePageTypeConfigs(Element root) {
+
+ private List<PageTypeConfig> parsePageTypeConfigs(Element aRoot) {
List<PageTypeConfig> configs = new ArrayList<PageTypeConfig>();
- for (Iterator i = root.elementIterator(ELEM_TYPE); i.hasNext(); ) {
- Element url = (Element)i.next();
+ for (Iterator i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) {
+ Element url = (Element) i.next();
PageTypeConfig config = parsePageTypeConfig(url);
configs.add(config);
}
return configs;
}
-
- private UrlConfig parseUrlConfig(Element aUrlElem) {
+
+ private UrlConfig parseUrlConfig(Element aUrlElem) {
String pattern = aUrlElem.elementText(ELEM_PATTERN);
PageRequest request = parseRequestConfig(aUrlElem);
return new UrlConfig(pattern, request);
}
-
- private PageTypeConfig parsePageTypeConfig(Element aTypeElem) {
+
+ private PageTypeConfig parsePageTypeConfig(Element aTypeElem) {
String pattern = aTypeElem.elementText(ELEM_PATTERN);
PageRequest request = parseRequestConfig(aTypeElem);
return new PageTypeConfig(pattern, request);
* @return
*/
private PageRequest parseRequestConfig(Element aUrlElem) {
- String method = aUrlElem.elementText(ELEM_METHOD);
+ String method = aUrlElem.elementText(ELEM_METHOD);
String xslt = aUrlElem.elementText(ELEM_XSLT);
List<NameValuePair> params = new ArrayList<NameValuePair>();
- for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext(); ) {
- Element paramElem = (Element)i.next();
+ for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext();) {
+ Element paramElem = (Element) i.next();
NameValuePair param = parseParameter(paramElem);
params.add(param);
}
-
+
NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]);
- PageRequest request;
- if ( METHOD_POST.equals(method)) {
- request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os);
- }
- else if ( METHOD_GET.equals(method) || method == null ){
- request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os);
- } else {
- throw new RuntimeException("Unknown request method '" + method + "'. Only " +
- METHOD_GET + " and " + METHOD_POST + " are supported");
+ PageRequest request;
+ if (METHOD_POST.equals(method)) {
+ request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
+ xslt, _os);
+ } else if (METHOD_GET.equals(method) || method == null) {
+ request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
+ xslt, _os);
+ } else {
+ throw new RuntimeException("Unknown request method '" + method
+ + "'. Only " + METHOD_GET + " and " + METHOD_POST
+ + " are supported");
}
return request;
}
-
- private NameValuePair parseParameter(Element aParam) {
+
+ private NameValuePair parseParameter(Element aParam) {
String name = aParam.attributeValue(AT_NAME);
String value = aParam.attributeValue(AT_VALUE);
return new NameValuePair(name, value);
private Crawler _crawler;
private Element _content;
-
- private Action[] _actions;
+
+ private Action[] _actions;
/**
* Constructs a page.
_content = aContent;
_actions = computeActions();
}
-
+
/*
* (non-Javadoc)
*
String name = elem.attributeValue(ATT_NAME);
String href = elem.attributeValue(ATT_HREF);
String type = elem.attributeValue(ATT_TYPE);
- if (type == null ) {
+ if (type == null) {
names.add(new ActionImpl(_crawler, elem, name, href));
- }
- else {
- names.add(new ActionImpl(_crawler, elem, name, href, new PageType(type)));
+ } else {
+ names.add(new ActionImpl(_crawler, elem, name, href,
+ new PageType(type)));
}
}
return names.toArray(new Action[0]);
return _content;
}
- /* (non-Javadoc)
+ /*
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Page#getActions()
*/
public Action[] getActions() {
return _actions;
}
-
+
/*
- * (non-Javadoc)
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Page#getAction(java.lang.String)
*/
public Action getAction(String aName) {
List<Action> results = new ArrayList<Action>();
- for (Action action: _actions) {
- if ( action.getName().equals(aName)) {
+ for (Action action : _actions) {
+ if (action.getName().equals(aName)) {
results.add(action);
}
}
+package org.wamblee.crawler;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
/**
*
*/
-public class Main {
+public final class Main {
+
+ /**
+ *
+ */
+ private static final int PROXY_PORT = 10000;
+
+ /**
+ *
+ */
+ private static final int MAX_REDIRECTS = 10;
+
+ /**
+ * Disabled constructor.
+ *
+ */
+ private Main() {
+ // Empty
+ }
private static final String BASE = "http://epg.kml.kiss-technology.com/";
- private static int count = 0;
+ private static int COUNT = 0;
public static void main(String[] aArgs) {
HttpClientParams clientParams = new HttpClientParams();
- clientParams.setIntParameter(HttpClientParams.MAX_REDIRECTS, 10);
+ clientParams.setIntParameter(HttpClientParams.MAX_REDIRECTS, MAX_REDIRECTS);
clientParams.setBooleanParameter(HttpClientParams.REJECT_RELATIVE_REDIRECT, false);
HttpClient client = new HttpClient(clientParams);
- client.getHostConfiguration().setProxy("localhost", 10000);
+ client.getHostConfiguration().setProxy("localhost", PROXY_PORT);
clientParams = client.getParams();
Object obj = clientParams.getParameter(HttpClientParams.MAX_REDIRECTS);
}
/**
- * @param client
- * @param method
+ * @param aClient
+ * @param aMethod
*/
- private static int executeMethod(HttpClient client, HttpMethod method) {
+ private static int executeMethod(HttpClient aClient, HttpMethod aMethod) {
//method.setFollowRedirects(true);
try {
// Execute the method.
- int statusCode = client.executeMethod(method);
+ int statusCode = aClient.executeMethod(aMethod);
if (statusCode != HttpStatus.SC_OK) {
- System.err.println("Method failed: " + method.getStatusLine());
+ System.err.println("Method failed: " + aMethod.getStatusLine());
}
// Read the response body.
- String filename = "output" + count++;
+ String filename = "output" + COUNT++;
FileOutputStream os = new FileOutputStream(new File(filename));
//os.write(method.getResponseBody());
Tidy tidy = new Tidy();
tidy.setXHTML(true);
- tidy.parse(method.getResponseBodyAsStream(), os);
+ tidy.parse(aMethod.getResponseBodyAsStream(), os);
os.close();
System.out.println("Written response to file: " + filename);
return statusCode;
throw new RuntimeException("Fatal transport error: " + e.getMessage());
} finally {
// Release the connection.
- method.releaseConnection();
+ aMethod.releaseConnection();
}
}
import java.util.List;
/**
- *
+ * Abstract visitor of the tv guide with default looping behavior.
*/
public abstract class AbstractVisitor implements Visitor {
+ /**
+ * Constructs the visitor.
+ *
+ */
protected AbstractVisitor() {
// Empty
}
- /* (non-Javadoc)
- * @see org.wamblee.crawler.kiss.Visitor#visitChannel(org.wamblee.crawler.kiss.Channel)
+ /**
+ * Visits the channel by visiting all programs of the channel.
+ * @param aChannel Channel to visit.
*/
public void visitChannel(Channel aChannel) {
List<Program> programs = aChannel.getPrograms();
}
}
- /* (non-Javadoc)
- * @see org.wamblee.crawler.kiss.Visitor#visitTvGuide(org.wamblee.crawler.kiss.TVGuide)
+ /**
+ * Visits the TV guide by visiting all channels of the guide.
+ * @param aGuide TV guide to visit.
*/
public void visitTvGuide(TVGuide aGuide) {
List<Channel> channels = aGuide.getChannels();
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.kiss;
import java.util.List;
/**
- *
+ * Represents the programme for a tv channel.
*/
public class Channel {
-
+
+ /**
+ * TV channel name.
+ */
private String _name;
- private List<Program> _programs;
-
- public Channel(String aName, List<Program> aPrograms) {
- _name = aName;
- _programs = aPrograms;
+
+ /**
+ * List of programs in chronological order.
+ */
+ private List<Program> _programs;
+
+ /**
+ * Constructs the channel.
+ * @param aName Channel name.
+ * @param aPrograms Programs.
+ */
+ public Channel(String aName, List<Program> aPrograms) {
+ _name = aName;
+ _programs = aPrograms;
}
-
- public String getName() {
- return _name;
+
+ /**
+ * Gets the channel name.
+ * @return channel name.
+ */
+ public String getName() {
+ return _name;
}
-
- public List<Program> getPrograms() {
+
+ /**
+ * Gets the list of programs.
+ * @return Programs.
+ */
+ public List<Program> getPrograms() {
return Collections.unmodifiableList(_programs);
}
-
- public void accept(Visitor aVisitor) {
+
+ /**
+ * Accepts a visitor.
+ * @param aVisitor Visitor.
+ */
+ public void accept(Visitor aVisitor) {
aVisitor.visitChannel(this);
}
}
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.Transport;
-import javax.mail.internet.AddressException;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;
import org.wamblee.crawler.impl.CrawlerImpl;
/**
+ * The KiSS crawler for automatic recording of interesting TV shows.
*
*/
public class KissCrawler {
private static final Log LOG = LogFactory.getLog(KissCrawler.class);
+ /**
+ * Log file name for the crawler.
+ */
private static final String LOG_FILE = "kiss.log";
+ /**
+ * Start URL of the electronic programme guide.
+ */
private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
+ /**
+ * Crawler configuration file.
+ */
private static final String CRAWLER_CONFIG = "config.xml";
+ /**
+ * Configuration file describing interesting programs.
+ */
private static final String PROGRAM_CONFIG = "programs.xml";
+ /**
+ * Regular expression for matching time interval strings in the
+ * retrieved pages.
+ */
private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
+ /**
+ * Compiled pattern for the time regular expression.
+ */
private Pattern _pattern;
+ /**
+ * Runs the KiSS crawler.
+ * @param aArgs Arguments, currently all ignored because the settings are hardcoded.
+ * @throws Exception In case of problems.
+ */
+ public static void main(String[] aArgs) throws Exception {
+ new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
+ }
+
+ /**
+ * Constructs the crawler. This retrieves the TV guide by crawling the
+ * KiSS EPG guide, filters the guide for interesting programs, tries to
+ * record them, and sends a summary mail to the user.
+ * @param aStartUrl Start URL of the electronic programme guide.
+ * @param aCrawlerConfig Configuration file for the crawler.
+ * @param aProgramConfig Configuration file describing interesting shows.
+ * @throws IOException In case of problems reading files.
+ * @throws MessagingException In case of problems sending a mail notification.
+ */
public KissCrawler(String aStartUrl, String aCrawlerConfig,
- String aProgramConfig) throws IOException, AddressException,
- MessagingException {
+ String aProgramConfig) throws IOException, MessagingException {
_pattern = Pattern.compile(TIME_REGEX);
}
/**
- * @param programCondition
- * @param guide
- * @throws AddressException
- * @throws MessagingException
+ * Records interesting shows.
+ * @param aProgramCondition Condition determining which shows are interesting.
+ * @param aGuide Television guide.
+ * @throws MessagingException In case of problems sending a summary mail.
*/
- private void recordInterestingShows(Condition<Program> programCondition,
- TVGuide guide) throws AddressException, MessagingException {
- MatchVisitor matcher = new MatchVisitor(programCondition);
- guide.accept(matcher);
+ private void recordInterestingShows(Condition<Program> aProgramCondition,
+ TVGuide aGuide) throws MessagingException {
+ MatchVisitor matcher = new MatchVisitor(aProgramCondition);
+ aGuide.accept(matcher);
List<Program> programs = matcher.getMatches();
String recorded = "";
String notRecorded = "";
}
/**
- * @param aCrawlerConfig
- * @param os
- * @param client
- * @return
- * @throws FileNotFoundException
+ * Creates the crawler.
+ * @param aCrawlerConfig Crawler configuration file.
+ * @param aOs Logging output stream for the crawler.
+ * @param aClient HTTP Client to use.
+ * @return Crawler.
+ * @throws FileNotFoundException In case configuration files cannot be found.
*/
- private Crawler createCrawler(String aCrawlerConfig, PrintStream os,
- HttpClient client) throws FileNotFoundException {
- ConfigurationParser parser = new ConfigurationParser(os);
+ private Crawler createCrawler(String aCrawlerConfig, PrintStream aOs,
+ HttpClient aClient) throws FileNotFoundException {
+ ConfigurationParser parser = new ConfigurationParser(aOs);
InputStream crawlerConfigFile = new FileInputStream(new File(
aCrawlerConfig));
Configuration config = parser.parse(crawlerConfigFile);
- Crawler crawler = new CrawlerImpl(client, config);
+ Crawler crawler = new CrawlerImpl(aClient, config);
return crawler;
}
/**
- * @param aStartUrl
- * @param crawler
- * @return
+ * Gets the start page of the electronic programme guide. This involves login and
+ * navigation to a suitable start page after logging in.
+ * @param aStartUrl URL of the electronic programme guide.
+ * @param aCrawler Crawler to use.
+ * @return Starting page.
*/
- private Page getStartPage(String aStartUrl, Crawler crawler) {
+ private Page getStartPage(String aStartUrl, Crawler aCrawler) {
try {
- Page page = crawler.getPage(aStartUrl);
+ Page page = aCrawler.getPage(aStartUrl);
return page.getAction("channels-favorites").execute();
} catch (PageException e) {
throw new RuntimeException(
}
}
- public static void main(String[] args) throws Exception {
- new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
- }
-
- private void showPage(Page aPage) {
- Action[] links = aPage.getActions();
- for (Action link : links) {
- System.out.println("Link found '" + link.getName() + "'");
- }
- Element element = aPage.getContent();
- System.out.println("Retrieved content: " + element.asXML());
- }
-
- private TVGuide createGuide(Page page) {
+ /**
+ * Creates the TV guide by web crawling.
+ * @param aPage Starting page.
+ * @return TV guide.
+ */
+ private TVGuide createGuide(Page aPage) {
LOG.info("Obtaining full TV guide");
- Action[] actions = page.getActions();
+ Action[] actions = aPage.getActions();
List<Channel> channels = new ArrayList<Channel>();
for (Action action : actions) {
try {
return new TVGuide(channels);
}
+ /**
+ * Creates channel information for a specific channel.
+ * @param aChannel Channel name.
+ * @param aPage Starting page for the channel.
+ * @return Channel.
+ */
private Channel createChannel(String aChannel, Page aPage) {
LOG.info("Obtaining program for " + aChannel);
Action[] programActions = aPage.getActions();
return new Channel(aChannel, programs);
}
- private void sendMail(String aText) throws AddressException,
- MessagingException {
+ /**
+ * Sends a summary mail to the user.
+ * @param aText Text of the mail.
+ * @throws MessagingException In case of problems sending mail.
+ */
+ private void sendMail(String aText) throws MessagingException {
Properties props = new Properties();
props.put("mail.transport.protocol", "smtp");
props.put("mail.smtp.host", "falcon");
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.kiss;
import org.wamblee.conditions.Condition;
/**
- *
+ * Visitor which determines the interesting programs in the TV guide.
*/
public class MatchVisitor extends AbstractVisitor {
-
- private Condition _matcher;
+
+ /**
+ * Criterion that determines which programs are interesting.
+ */
+ private Condition _matcher;
+
+ /**
+ * List of interesting programs.
+ */
private List<Program> _programs;
-
- public MatchVisitor(Condition aMatcher) {
- _matcher = aMatcher;
+
+ /**
+ * Constructs the visitor.
+ * @param aMatcher Condition describing interesting programs.
+ */
+ public MatchVisitor(Condition aMatcher) {
+ _matcher = aMatcher;
_programs = new ArrayList<Program>();
}
-
- /* (non-Javadoc)
+
+ /*
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.kiss.Visitor#visitProgram(org.wamblee.crawler.kiss.Program)
*/
public void visitProgram(Program aProgram) {
- if ( _matcher.matches(aProgram)) {
+ if (_matcher.matches(aProgram)) {
_programs.add(aProgram);
}
}
-
- public List<Program> getMatches() {
- return _programs;
+
+ /**
+ * Gets the list of interesting programs. To be called after applying
+ * the visitor on a tv guide.
+ * @return List of interesting programs.
+ */
+ public List<Program> getMatches() {
+ return _programs;
}
-
}
import java.io.PrintStream;
/**
- *
+ * Print visitor for pretty printing the TV guide.
*/
public class PrintVisitor extends AbstractVisitor {
+ /**
+ * Stream to print the guide on.
+ */
private PrintStream _stream;
+ /**
+ * Constructs the print visitor.
+ * @param aStream Stream to print on.
+ */
public PrintVisitor(PrintStream aStream) {
_stream = aStream;
}
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.kiss;
import org.wamblee.crawler.PageException;
/**
- *
+ * Represents a television program.
*/
public class Program {
-
+
+ /**
+ * Name of the record action on the program details page.
+ */
private static final String RECORD_ACTION = "record";
+
+ /**
+ * Indent string to use for pretty printing.
+ */
private static final String INDENT = " ";
-
- private String _channel;
+
+ /**
+ * Channel the program is on.
+ */
+ private String _channel;
+
+ /**
+ * Program name.
+ */
private String _name;
- private String _description;
+
+ /**
+ * Program description.
+ */
+ private String _description;
+
+ /**
+ * Keywords or classification of the program.
+ */
private String _keywords;
+
+ /**
+ * Time interval for the program (from/to).
+ */
private TimeInterval _interval;
- private Action _programInfo;
-
- public Program(String aChannel, String aName, String aDescription, String aKeywords, TimeInterval aInterval, Action aProgramInfo) {
- _channel = aChannel;
- _name = aName;
+
+ /**
+ * Action to execute to obtain program information and/or record the program.
+ */
+ private Action _programInfo;
+
+ /**
+ * Constructs the program.
+ * @param aChannel Channel name.
+ * @param aName Program name.
+ * @param aDescription Description.
+ * @param aKeywords Keywords/classification.
+ * @param aInterval Time interval.
+ * @param aProgramInfo Action to execute for detailed program information or
+ * for recording the program.
+ */
+ public Program(String aChannel, String aName, String aDescription,
+ String aKeywords, TimeInterval aInterval, Action aProgramInfo) {
+ _channel = aChannel;
+ _name = aName;
_description = aDescription;
- _keywords = aKeywords;
+ _keywords = aKeywords;
_interval = aInterval;
- _programInfo = aProgramInfo;
+ _programInfo = aProgramInfo;
}
-
- public String getChannel() {
- return _channel;
+
+ /**
+ * Gets the channel.
+ * @return Channel.
+ */
+ public String getChannel() {
+ return _channel;
}
-
- public String getName() {
- return _name;
+
+ /**
+ * Gets the program name.
+ * @return Name.
+ */
+ public String getName() {
+ return _name;
}
-
- public String getDescription() {
+
+ /**
+ * Gets the description.
+ * @return Description.
+ */
+ public String getDescription() {
return _description;
}
-
- public String getKeywords() {
- return _keywords;
+
+ /**
+ * Gets the keywords/classification.
+ * @return Keywords/classification
+ */
+ public String getKeywords() {
+ return _keywords;
}
-
- public TimeInterval getInterval() {
- return _interval;
+
+ /**
+ * Gets the time interval.
+ * @return Time interval.
+ */
+ public TimeInterval getInterval() {
+ return _interval;
}
-
- public boolean record() throws PageException {
- Action record = _programInfo.execute().getAction(RECORD_ACTION);
- if ( record == null) {
+
+ /**
+ * Records the show.
+ * @return True iff an attempt could be made to record the program.
+ * @throws PageException In case of problems recording the program.
+ */
+ public boolean record() throws PageException {
+ Action record = _programInfo.execute().getAction(RECORD_ACTION);
+ if (record == null) {
return false;
}
- record.execute();
- return true;
+ record.execute();
+ return true;
}
-
- public void accept(Visitor aVisitor) {
+
+ /**
+ * Accepts the visitor.
+ * @param aVisitor Visitor.
+ */
+ public void accept(Visitor aVisitor) {
aVisitor.visitProgram(this);
}
-
- /* (non-Javadoc)
+
+ /*
+ * (non-Javadoc)
+ *
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
- return _interval + " - " + _name + " (" + _channel + "/" + _keywords + ")" + "\n" +
- (INDENT + _description).replaceAll("\n", "\n" + INDENT);
+ return _interval + " - " + _name + " (" + _channel + "/" + _keywords
+ + ")" + "\n"
+ + (INDENT + _description).replaceAll("\n", "\n" + INDENT);
}
}
* Parse the configuration of desired programs.
*/
public class ProgramConfigurationParser {
-
-
+
private static final String ELEM_PROGRAM = "program";
+
private static final String ELEM_PATTERN = "name";
/**
Element root = document.getRootElement();
List<Condition<Program>> conditions = new ArrayList<Condition<Program>>();
- for (Iterator i = root.elementIterator(ELEM_PROGRAM); i.hasNext(); ) {
- Element program = (Element)i.next();
- String pattern = ".*" + program.element(ELEM_PATTERN).getText() + ".*";
- conditions.add(new ProgramNameMatcher(pattern));
+ for (Iterator i = root.elementIterator(ELEM_PROGRAM); i.hasNext();) {
+ Element program = (Element) i.next();
+ String pattern = ".*" + program.element(ELEM_PATTERN).getText()
+ + ".*";
+ conditions.add(new ProgramNameMatcher(pattern));
}
return new OrCondition<Program>(conditions);
} catch (DocumentException e) {
*/
public class ProgramNameMatcher implements Condition<Program> {
+ /**
+ * Pattern which describes interesting programs.
+ */
private Pattern _pattern;
+ /**
+ * Constructs the matcher.
+ * @param aPattern Pattern that describes interesting programs.
+ */
public ProgramNameMatcher(String aPattern) {
_pattern = Pattern.compile(aPattern);
}
- /* (non-Javadoc)
- * @see org.wamblee.crawler.kiss.ProgramMatcher#matches(org.wamblee.crawler.kiss.Program)
+ /**
+ * Determines if the program name matches.
+ * @param aProgram Program.
+ * @return True iff the program name matches.
*/
public boolean matches(Program aProgram) {
Matcher matcher = _pattern.matcher(aProgram.getName().toLowerCase());
import java.util.List;
/**
- *
+ * The TV guide.
*/
public class TVGuide {
+ /**
+ * List of channels.
+ */
private List<Channel> _channels;
+ /**
+ * Constructs the guide.
+ * @param aChannels Channels of the guide.
+ */
public TVGuide(List<Channel> aChannels) {
_channels = aChannels;
}
+ /**
+ * Gets the channels.
+ * @return Channels.
+ */
public List<Channel> getChannels() {
return Collections.unmodifiableList(_channels);
}
+ /**
+ * Accepts the visitor.
+ * @param aVisitor Visitor.
+ */
public void accept(Visitor aVisitor) {
aVisitor.visitTvGuide(this);
}
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.kiss;
import java.text.NumberFormat;
/**
- *
+ * Time at which a program starts or ends.
*/
public class Time {
-
- private int _hour;
- private int _minute;
-
- public Time(int aHour, int aMinute) {
+
+ /**
+ * Number of minutes per hour (despite the constant's name); used to convert minutes to a fraction of an hour.
+ */
+ private static final double SECONDS_PER_MINUTE = 60.0;
+
+ /**
+ * Hour of the time.
+ */
+ private int _hour;
+
+ /**
+ * Minute of the hour.
+ */
+ private int _minute;
+
+ /**
+ * Constructs the time.
+ * @param aHour Hour.
+ * @param aMinute Minute.
+ */
+ public Time(int aHour, int aMinute) {
_hour = aHour;
_minute = aMinute;
}
-
- public int getHour() {
- return _hour;
+
+ /**
+ * Gets the hour.
+ * @return Hour.
+ */
+ public int getHour() {
+ return _hour;
}
-
- public int getMinute() {
- return _minute;
+
+ /**
+ * Gets the minute.
+ * @return Minute.
+ */
+ public int getMinute() {
+ return _minute;
}
- /* (non-Javadoc)
+ /*
+ * (non-Javadoc)
+ *
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
NumberFormat format = new DecimalFormat("00");
- return format.format(_hour) + ":" + format.format(_minute);
+ return format.format(_hour) + ":" + format.format(_minute);
}
-
+
+ /**
+ * Convert time to floating point value. Useful for comparing two times.
+ * @return Converted value.
+ */
float asFloat() {
- return (float)_hour + (float)_minute/(float)60.0;
+ return (float) _hour + (float) _minute / (float) SECONDS_PER_MINUTE;
}
}
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.kiss;
/**
- *
+ * Time interval.
*/
public class TimeInterval {
-
- private Time _begin;
- private Time _end;
-
- public TimeInterval(Time aBegin, Time aEnd) {
- _begin = aBegin;
- _end = aEnd;
+
+ /**
+ * Begin time.
+ */
+ private Time _begin;
+
+ /**
+ * End time.
+ */
+ private Time _end;
+
+ /**
+ * Constructs the interval.
+ * @param aBegin Start time.
+ * @param aEnd End time.
+ */
+ public TimeInterval(Time aBegin, Time aEnd) {
+ _begin = aBegin;
+ _end = aEnd;
}
-
- public Time getBegin() {
- return _begin;
+
+ /**
+ * Gets the begin time.
+ * @return Begin time.
+ */
+ public Time getBegin() {
+ return _begin;
}
-
- public Time getEnd() {
- return _end;
+
+ /**
+ * Gets the end time.
+ * @return End time.
+ */
+ public Time getEnd() {
+ return _end;
}
- /* (non-Javadoc)
+ /*
+ * (non-Javadoc)
+ *
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
- return _begin + " - " + _end;
+ return _begin + " - " + _end;
}
-
+
/**
- * Determines if there is an overlap between the current interval and given one.
+ * Determines if there is an overlap between the current interval and given
+ * one.
*
- * @param aInterval Interval to compare with.
+ * @param aInterval
+ * Interval to compare with.
* @return True iff there is overlap
*/
- public boolean overlap(TimeInterval aInterval) {
-
- if ( isUncertain() || aInterval.isUncertain()) {
- // Optimistic assume there is no overlap if one of the intervals is uncertain.
- return false;
+ public boolean overlap(TimeInterval aInterval) {
+
+ if (isUncertain() || aInterval.isUncertain()) {
+ // Optimistically assume there is no overlap if one of the intervals is
+ // uncertain.
+ return false;
}
-
- if ( _end.asFloat() <= aInterval._begin.asFloat() ||
- aInterval._end.asFloat() <= _begin.asFloat() ) {
- return false;
+
+ if (_end.asFloat() <= aInterval._begin.asFloat()
+ || aInterval._end.asFloat() <= _begin.asFloat()) {
+ return false;
}
-
+
return true;
}
-
+
/**
- * Determines if the actual time that the program corresponds to is uncertain due to
- * the representation of a period of more than 24 hours using a 24 hour clock.
- * @return True iff the interval is uncertain.
+ * Determines if the actual time that the program corresponds to is
+ * uncertain due to the representation of a period of more than 24 hours
+ * using a 24 hour clock.
+ *
+ * @return True iff the interval is uncertain.
*/
- boolean isUncertain() {
- return _begin.asFloat() > _end.asFloat();
+ boolean isUncertain() {
+ return _begin.asFloat() > _end.asFloat();
}
}
package org.wamblee.crawler.kiss;
/**
- *
+ * Visitor of the TV guide.
*/
public interface Visitor {
+ /**
+ * Visits a program.
+ * @param aProgram Program.
+ */
void visitProgram(Program aProgram);
+ /**
+ * Visits a channel.
+ * @param aChannel Channel.
+ */
void visitChannel(Channel aChannel);
+ /**
+ * Visits the guide.
+ * @param aGuide Guide.
+ */
void visitTvGuide(TVGuide aGuide);
}