--- /dev/null
+<?xml version="1.0"?>
+
+<!DOCTYPE project [
+ <!ENTITY header SYSTEM "file:../build/header.xml">
+ <!ENTITY trailer SYSTEM "file:../build/trailer.xml">
+]>
+
+<project name="crawler" default="jar" basedir=".">
+
+
+ <!-- =============================================================================== -->
+ <!-- Include the build header defining general properties -->
+ <!-- =============================================================================== -->
+ <property name="project.home" value=".."/>
+ <property name="module.name" value="crawler" />
+
+ &header;
+
+ <target name="module.build.deps"
+ depends="logging.d,dom4j.d,xerces.d,httpclient.d,jtidy.d,wamblee.support.d">
+ </target>
+
+ <!-- Additional libraries needed only for the tests. A library that
+ is already mentioned in module.build.path should not be
+ mentioned below again. -->
+ <target name="module.test.deps" depends="wamblee.support.test.d">
+ </target>
+
+ &trailer;
+
+
+</project>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<crawler>
+
+ <type>
+ <pattern>channels-favorites</pattern>
+ <xslt>channels-favorites.xsl</xslt>
+ </type>
+
+ <type>
+ <pattern>channel-overview</pattern>
+ <xslt>channel-overview.xsl</xslt>
+ </type>
+
+ <type>
+ <pattern>right-now</pattern>
+ <xslt>channel-right-now.xsl</xslt>
+ </type>
+
+ <type>
+ <pattern>program-info</pattern>
+ <xslt>program-info.xsl</xslt>
+ </type>
+
+
+ <url>
+ <pattern>http://epg.kml.kiss-technology.com/login_core.php</pattern>
+ <method>post</method>
+ <xslt>login.xsl</xslt>
+ <param name="user" value="bladibla"/>
+ <param name="passwd" value="abc123"/>
+ <param name="GMode" value="TextMode"/>
+ <param name="submit" value="Login"/>
+ </url>
+
+ <url>
+ <pattern>.*</pattern>
+ <method>get</method>
+ <xslt>identity.xsl</xslt>
+ </url>
+</crawler>
+
--- /dev/null
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpMethod;
+import org.apache.commons.httpclient.HttpStatus;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.methods.PostMethod;
+import org.apache.commons.httpclient.params.HttpClientParams;
+import org.apache.commons.httpclient.params.HttpMethodParams;
+import org.w3c.tidy.Tidy;
+
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ */
+public class Main {
+
+ private static final String BASE = "http://epg.kml.kiss-technology.com/";
+
+ private static int count = 0;
+
+ public static void main(String[] aArgs) {
+ HttpClientParams clientParams = new HttpClientParams();
+ clientParams.setIntParameter(HttpClientParams.MAX_REDIRECTS, 10);
+ clientParams.setBooleanParameter(HttpClientParams.REJECT_RELATIVE_REDIRECT, false);
+ HttpClient client = new HttpClient(clientParams);
+ client.getHostConfiguration().setProxy("localhost", 10000);
+
+ clientParams = client.getParams();
+ Object obj = clientParams.getParameter(HttpClientParams.MAX_REDIRECTS);
+ System.out.println("Max redirects = " + obj);
+ HttpMethod method = new GetMethod(BASE + "l.php");
+
+ executeMethod(client, method);
+
+ PostMethod postMethod = new PostMethod(BASE + "login_core.php");
+ HttpMethodParams params = new HttpMethodParams();
+ params.setParameter("user", "erik@brakkee.org");
+ params.setParameter("passwd", "ebra1969");
+ params.setParameter("SavePlayerID", "");
+ params.setParameter("GMode", "TextMode");
+ params.setParameter("submit", "Login");
+
+ NameValuePair[] data = new NameValuePair[] {
+ new NameValuePair("user", "erik@brakkee.org"),
+ new NameValuePair("passwd", "ebra1969"),
+ new NameValuePair("GMode", "TextMode"),
+ new NameValuePair("submit", "Login")
+ };
+ postMethod.addParameters(data);
+
+
+ executeMethod(client, postMethod);
+
+ Header header = postMethod.getResponseHeader("Location");
+ System.out.println("Redirecting to: " + header.getValue());
+ method = new GetMethod(header.getValue());
+ executeMethod(client, method);
+
+
+ }
+
+ /**
+ * @param client
+ * @param method
+ */
+ private static int executeMethod(HttpClient client, HttpMethod method) {
+ //method.setFollowRedirects(true);
+ try {
+ // Execute the method.
+ int statusCode = client.executeMethod(method);
+
+ if (statusCode != HttpStatus.SC_OK) {
+ System.err.println("Method failed: " + method.getStatusLine());
+ }
+
+ // Read the response body.
+ String filename = "output" + count++;
+ FileOutputStream os = new FileOutputStream(new File(filename));
+ //os.write(method.getResponseBody());
+
+ Tidy tidy = new Tidy();
+ tidy.setXHTML(true);
+ tidy.parse(method.getResponseBodyAsStream(), os);
+ os.close();
+ System.out.println("Written response to file: " + filename);
+ return statusCode;
+ } catch (HttpException e) {
+ throw new RuntimeException("Fatal protocol violation: " + e.getMessage());
+ } catch (IOException e) {
+ throw new RuntimeException("Fatal transport error: " + e.getMessage());
+ } finally {
+ // Release the connection.
+ method.releaseConnection();
+ }
+ }
+
+}
--- /dev/null
+
+############################################################################################
+# Default configuration file for log4j.
+#
+# This properties file is used if no other configuration of log4j is done explicitly.
+############################################################################################
+
+
+# Root logger reports only ERROR and above, and uses the console appender
+log4j.rootLogger=ERROR, console
+
+# Log level for wamblee.org
+log4j.logger.org.wamblee=DEBUG
+log4j.logger.org.wamblee.usermgt.UserAdministrationImplTest=INFO
+log4j.logger.org.wamblee.security.authorization=ERROR
+log4j.logger.org.wamblee.cache=INFO
+
+
+log4j.logger.org.springframework=ERROR
+log4j.logger.net.sf.ehcache=WARN
+
+# Default log level for hibernate
+log4j.logger.org.hibernate=ERROR
+log4j.logger.org.hibernate3=ERROR
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
+
+######################################################################################
+# Hibernate SQL logging, switch the log level to DEBUG to see the output.
+######################################################################################
+
+log4j.logger.org.wamblee.test.SpringTestCase=ERROR, console
+log4j.additivity.org.wamblee.test.SpringTestCase=false
+
+# Logging for queries.
+log4j.logger.org.hibernate.SQL=ERROR, sql
+log4j.additivity.org.hibernate.SQL=false
+
+# Logging for query parameters and return values.
+log4j.logger.org.hibernate.type=ERROR, sqltype
+log4j.additivity.org.hibernate.type=false
+
+# Appender for the queries
+log4j.appender.sql=org.apache.log4j.ConsoleAppender
+log4j.appender.sql.layout=org.apache.log4j.PatternLayout
+log4j.appender.sql.layout.ConversionPattern=%n%-4r [%t] SQL: %x - %m%n
+
+# Appender to show the actual parameters and return values of the queries.
+log4j.appender.sqltype=org.apache.log4j.ConsoleAppender
+log4j.appender.sqltype.layout=org.apache.log4j.PatternLayout
+log4j.appender.sqltype.layout.ConversionPattern=%-4r [%t] SQL: %x - %m%n
+
+
+
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpMethod;
+import org.apache.commons.httpclient.HttpStatus;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.w3c.dom.Document;
+import org.w3c.tidy.Tidy;
+import org.wamblee.xml.XSLT;
+
+/**
+ * General support claas for all kinds of requests.
+ */
+public abstract class AbstractPageRequest implements PageRequest {
+
+ private static final Log LOG = LogFactory.getLog(AbstractPageRequest.class);
+ private static final String REDIRECT_HEADER = "Location";
+
+ private NameValuePair[] _params;
+
+ private String _xslt;
+
+ private PrintStream _os;
+
+ protected AbstractPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+ if ( aParams == null ) {
+ throw new IllegalArgumentException("aParams is null");
+ }
+ if ( aXslt == null ) {
+ throw new IllegalArgumentException("aXslt is null");
+ }
+ _params = aParams;
+ _xslt = aXslt;
+ _os = aOs;
+ }
+
+ /* (non-Javadoc)
+ * @see org.wamblee.crawler.PageRequest#overrideXslt(java.lang.String)
+ */
+ public void overrideXslt(String aXslt) {
+ _xslt = aXslt;
+ }
+
+ protected NameValuePair[] getParameters() {
+ return _params;
+ }
+
+ protected Document executeMethod(HttpClient client, HttpMethod method) {
+ try {
+ // Execute the method.
+ method = executeWithRedirects(client, method);
+
+ // Transform the HTML into wellformed XML.
+ Tidy tidy = new Tidy();
+ tidy.setXHTML(true);
+ tidy.setQuiet(true);
+ tidy.setShowWarnings(false);
+ if ( _os != null ) {
+ _os.println("Content of '" + method.getURI() + "'");
+ _os.println();
+ }
+ // We let jtidy produce raw output because the DOM it produces is
+ // is not namespace aware. We let the XSLT processor parse the XML again
+ // to ensure that the XSLT uses a namespace aware DOM tree. An alternative
+ // is to configure namespace awareness of the XML parser in a system wide way.
+ ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
+ tidy.parse(method.getResponseBodyAsStream(), xhtml);
+ _os.print(new String(xhtml.toByteArray()));
+ // Obtaining the XML as dom is not used.
+ //Document w3cDoc = tidy.parseDOM(method.getResponseBodyAsStream(),
+ // _os);
+ if ( _os != null ) {
+ _os.println();
+ }
+ xhtml.flush();
+ byte[] xhtmlData = xhtml.toByteArray();
+ Document transformed = XSLT.transform(xhtmlData, new File(_xslt));
+ _os.println("Transformed result is: ");
+ Transformer transformer = TransformerFactory.newInstance().newTransformer();
+ transformer.setParameter(OutputKeys.INDENT, "yes");
+ transformer.setParameter(OutputKeys.METHOD, "xml");
+ transformer.transform(new DOMSource(transformed), new StreamResult(_os));
+
+ return transformed;
+ } catch (Exception e) {
+ throw new RuntimeException(e.getMessage(), e);
+ } finally {
+ // Release the connection.
+ method.releaseConnection();
+ }
+ }
+
+ /**
+ * @param aClient
+ * @param aMethod
+ * @throws IOException
+ * @throws HttpException
+ */
+ private HttpMethod executeWithRedirects(HttpClient aClient, HttpMethod aMethod) throws IOException, HttpException {
+ int statusCode = aClient.executeMethod(aMethod);
+
+ switch (statusCode) {
+ case HttpStatus.SC_OK: {
+ return aMethod;
+ }
+ case HttpStatus.SC_MOVED_PERMANENTLY:
+ case HttpStatus.SC_MOVED_TEMPORARILY:
+ case HttpStatus.SC_SEE_OTHER: {
+ aMethod.releaseConnection();
+ Header header = aMethod.getResponseHeader(REDIRECT_HEADER);
+ aMethod = new GetMethod(header.getValue());
+ return executeWithRedirects(aClient, aMethod); // TODO protect against infinite recursion.
+ }
+ default: {
+ throw new RuntimeException("Method failed: "
+ + aMethod.getStatusLine());
+ }
+ }
+ }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+import org.dom4j.Element;
+
+/**
+ * An action defined on a page.
+ */
+public interface Action {
+
+ /**
+ * The name of the action.
+ * @return Action name.
+ */
+ String getName();
+
+ /**
+ * Executes the action.
+ * @return Page obtained by executing the action.
+ */
+ Page execute();
+
+ /**
+ * Gets a description of the action. The element returned is the action element
+ * itself.
+ * @return The action element.
+ */
+ Element getContent();
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+/**
+ * Configuration which determines how a specific page must be retrieved and
+ * what transformation should be applied to it.
+ */
+public interface Configuration {
+
+ /**
+ * Gets the page request based on the URL.
+ * @param aUrl URL of the page.
+ * @return Page request.
+ */
+ PageRequest getRequest(String aUrl);
+
+ /**
+ * Gets the page request based on the type of the page instead
+ * of on the URL.
+ * @param aType Type of page.
+ * @return Page request.
+ */
+ PageRequest getRequest(PageType aType);
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+
+/**
+ * Retrieves pages, either by URL alone or by URL together with an
+ * explicit page type.
+ */
+public interface Crawler {
+
+ /**
+ * Gets the content for a specific page.
+ * @param aUrl Url of page.
+ * @return The retrieved page.
+ */
+ Page getPage(String aUrl);
+
+ /**
+ * Gets the content for a specific page of a given type.
+ * @param aUrl Url of page.
+ * @param aType Type of page.
+ * @return The retrieved page.
+ */
+ Page getPage(String aUrl, PageType aType);
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+import java.io.PrintStream;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.HttpMethod;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.w3c.dom.Document;
+
+/**
+ * Gets a page by issueing a get request.
+ */
+public class GetPageRequest extends AbstractPageRequest {
+
+ public GetPageRequest(NameValuePair[] aParams, String aXslt) {
+ super(aParams, aXslt, null);
+ }
+
+ public GetPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+ super(aParams, aXslt, aOs);
+ }
+
+
+ /* (non-Javadoc)
+ * @see org.wamblee.crawler.PageRequest#getPage(org.apache.commons.httpclient.HttpClient)
+ */
+ public Document execute(String aUrl, HttpClient aClient) {
+ HttpMethod method = new GetMethod(aUrl);
+ if ( getParameters().length > 0 ) {
+ String oldQueryString = method.getQueryString();
+ method.setQueryString(getParameters());
+ String queryString = method.getQueryString();
+ if ( oldQueryString.length() > 0 ) {
+ queryString = queryString + '&' + oldQueryString;
+ method.setQueryString(queryString);
+ }
+ }
+
+ return executeMethod(aClient, method);
+ }
+
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+import org.dom4j.Element;
+
+/**
+ * Represents a retrieved page.
+ */
+public interface Page {
+
+ /**
+ * Gets the content of the page as raw XML.
+ * @return Page content.
+ */
+ Element getContent();
+
+ /**
+ * Obtains the actions available on the page.
+ * @return Actions of the page.
+ */
+ Action[] getActions();
+
+ /**
+ * Gets the named action. Only works if the action name is unique.
+ * @param aName Name of the action.
+ * @return Action object.
+ */
+ Action getAction(String aName);
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.w3c.dom.Document;
+
+/**
+ * Represents a specific request to obtain and transform a page.
+ */
+public interface PageRequest {
+
+ /**
+ * Gets a page as an XML document.
+ * @param aUrl Url of the page to get.
+ * @param aClient Http client to use.
+ * @return The page as an XML document.
+ */
+ Document execute(String aUrl, HttpClient aClient);
+
+ /**
+ * Overrides the Xslt to use.
+ * @param aXslt Xslt to use.
+ */
+ void overrideXslt(String aXslt);
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+/**
+ *
+ */
+public class PageType {
+
+ private String _type;
+
+ public PageType(String aType) {
+ _type = aType;
+ }
+
+ public String getType() {
+ return _type;
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ return "PageType(type='" + _type + "')";
+ }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+import java.io.PrintStream;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.methods.PostMethod;
+import org.w3c.dom.Document;
+
+/**
+ * Retrieving pages using the post method.
+ */
+public class PostPageRequest extends AbstractPageRequest {
+
+ public PostPageRequest(NameValuePair[] aParams, String aXslt) {
+ super(aParams, aXslt, null);
+ }
+
+ public PostPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+ super(aParams, aXslt, aOs);
+ }
+
+
+ /* (non-Javadoc)
+ * @see org.wamblee.crawler.PageRequest#execute(java.lang.String, org.apache.commons.httpclient.HttpClient)
+ */
+ public Document execute(String aUrl, HttpClient aClient) {
+ PostMethod method = new PostMethod(aUrl);
+ method.addParameters(getParameters());
+ return executeMethod(aClient, method);
+ }
+
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.impl;
+
+import org.dom4j.Element;
+import org.wamblee.crawler.Action;
+import org.wamblee.crawler.Crawler;
+import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageType;
+
+/**
+ * Action that, when executed, asks the crawler for the page at the
+ * action's reference, optionally with an explicit page type.
+ */
+public class ActionImpl implements Action {
+
+    private Crawler _crawler;
+    private Element _content;
+    private String _name;
+    private String _reference;
+    private PageType _type;
+
+    /**
+     * Constructs an action without an explicit page type.
+     *
+     * @param aCrawler Crawler used to retrieve the page.
+     * @param aContent Element describing the action.
+     * @param aName Name of the action.
+     * @param aReference Reference passed to the crawler on execution.
+     */
+    public ActionImpl(Crawler aCrawler, Element aContent, String aName, String aReference) {
+        this(aCrawler, aContent, aName, aReference, null);
+    }
+
+    /**
+     * Constructs an action with an explicit page type.
+     *
+     * @param aCrawler Crawler used to retrieve the page.
+     * @param aContent Element describing the action.
+     * @param aName Name of the action.
+     * @param aReference Reference passed to the crawler on execution.
+     * @param aType Page type passed to the crawler, or null for none.
+     */
+    public ActionImpl(Crawler aCrawler, Element aContent, String aName, String aReference, PageType aType) {
+        _type = aType;
+        _reference = aReference;
+        _name = aName;
+        _content = aContent;
+        _crawler = aCrawler;
+    }
+
+    /* (non-Javadoc)
+     * @see org.wamblee.crawler.Action#getName()
+     */
+    public String getName() {
+        return _name;
+    }
+
+    /* (non-Javadoc)
+     * @see org.wamblee.crawler.Action#execute()
+     */
+    public Page execute() {
+        if ( _type != null ) {
+            return _crawler.getPage(_reference, _type);
+        }
+        return _crawler.getPage(_reference);
+    }
+
+    /* (non-Javadoc)
+     * @see org.wamblee.crawler.Action#getContent()
+     */
+    public Element getContent() {
+        return _content;
+    }
+}
--- /dev/null
+package org.wamblee.crawler.impl;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.PrintStream;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.dom4j.Element;
+import org.wamblee.crawler.Action;
+import org.wamblee.crawler.Configuration;
+import org.wamblee.crawler.Crawler;
+import org.wamblee.crawler.Page;
+
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Entry point for the crawler. Parses the configuration, retrieves the start
+ * page and walks a fixed sequence of actions, printing each page. A trace is
+ * written to the file named by {@link #LOG_FILE}.
+ */
+public class App {
+
+    private static final String LOG_FILE = "crawler.log";
+
+    /**
+     * Runs the crawler.
+     *
+     * @param args <code>args[0]</code> is the configuration file name,
+     *             <code>args[1]</code> the URL to start at.
+     * @throws IllegalArgumentException if fewer than two arguments are given.
+     * @throws Exception on any failure while reading the configuration or
+     *         crawling.
+     */
+    public static void main(String[] args) throws Exception {
+        if ( args.length < 2 ) {
+            throw new IllegalArgumentException(
+                    "Usage: App <config-file> <start-url>");
+        }
+        String configFileName = args[0];
+        String starturl = args[1];
+
+        FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
+        PrintStream os = new PrintStream(fos);
+
+        try {
+            ConfigurationParser parser = new ConfigurationParser(os);
+            Configuration config;
+            InputStream configFile = new FileInputStream(new File(
+                    configFileName));
+            try {
+                config = parser.parse(configFile);
+            } finally {
+                // The configuration is fully read by now; do not leak the file handle.
+                configFile.close();
+            }
+
+            HttpClient client = new HttpClient();
+            // client.getHostConfiguration().setProxy("localhost", 3128);
+
+            Crawler crawler = new CrawlerImpl(client, config);
+
+            System.out.println("Retrieving: " + starturl);
+            Page page = crawler.getPage(starturl);
+            showPage(page);
+            page = page.getAction("channels-favorites").execute();
+            recordInterestingShows(page);
+            showPage(page);
+            page = page.getAction("Nederland 1").execute();
+            showPage(page);
+            page = page.getAction("right-now").execute();
+            showPage(page);
+            page = page.getAction("Het elfde uur").execute();
+            showPage(page);
+        } finally {
+            os.flush();
+            os.close();
+            System.out.println("Output written on '" + LOG_FILE + "'");
+        }
+    }
+
+    /**
+     * Prints the names of the actions on the page and the page content.
+     *
+     * @param aPage Page to show.
+     */
+    private static void showPage(Page aPage) {
+        Action[] links = aPage.getActions();
+        for (Action link: links) {
+            System.out.println("Link found '" + link.getName() + "'");
+        }
+        Element element = aPage.getContent();
+        System.out.println("Retrieved content: " + element.asXML());
+    }
+
+    /**
+     * Executes every action of the page, then its "right-now" action, and
+     * examines the resulting page.
+     *
+     * @param page Page whose actions are treated as channels.
+     */
+    private static void recordInterestingShows(Page page) {
+        Action[] channels = page.getActions();
+        for (Action channel: channels) {
+            examineChannel(channel.getName(), channel.execute().getAction("right-now").execute());
+        }
+    }
+
+    /**
+     * Prints the actions of a page as programs of a channel and, for matching
+     * program names, reports whether a "record" action is present on the
+     * program's page.
+     *
+     * @param aChannel Name of the channel, used in the output only.
+     * @param aPage Page whose actions are treated as programs.
+     */
+    private static void examineChannel(String aChannel, Page aPage) {
+        Action[] programs = aPage.getActions();
+        for (Action program: programs) {
+            System.out.println(aChannel + " - " + program.getName());
+            if ( program.getName().toLowerCase().matches(".*babe.*")) {
+                Page programPage = program.execute();
+                Action record = programPage.getAction("record");
+                // Parentheses are required: '+' binds tighter than '!=', so
+                // without them the concatenated string is compared to null.
+                System.out.println("Recording possible: " + (record != null));
+            }
+        }
+    }
+
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.impl;
+
+import java.util.regex.Pattern;
+
+/**
+ *
+ */
+class ConfigItem<ValueType> {
+
+ private Pattern _pattern;
+ private ValueType _value;
+
+ protected ConfigItem(String aPattern, ValueType aValue) {
+ _pattern = Pattern.compile(aPattern);
+ _value = aValue;
+ }
+
+ protected ValueType match(String aValue) {
+ if ( !_pattern.matcher(aValue).matches() ) {
+ return null;
+ }
+ return _value;
+ }
+
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.impl;
+
+import java.util.List;
+
+import org.wamblee.crawler.Configuration;
+import org.wamblee.crawler.PageRequest;
+import org.wamblee.crawler.PageType;
+
+
+/**
+ * Crawler configuration backed by two ordered lists: one consulted by URL
+ * and one by page type. The first entry that yields a request wins.
+ */
+public class ConfigurationImpl implements Configuration {
+
+    private List<UrlConfig> _urlConfig;
+    private List<PageTypeConfig> _pageTypeConfig;
+
+    /**
+     * Constructs the configuration.
+     *
+     * @param aUrlConfig Entries consulted when looking up by URL.
+     * @param aPageTypeConfig Entries consulted when looking up by page type.
+     */
+    public ConfigurationImpl(List<UrlConfig> aUrlConfig, List<PageTypeConfig> aPageTypeConfig) {
+        _urlConfig = aUrlConfig;
+        _pageTypeConfig = aPageTypeConfig;
+    }
+
+    /* (non-Javadoc)
+     * @see org.wamblee.crawler.Configuration#getRequest(java.lang.String)
+     */
+    public PageRequest getRequest(String aUrl) {
+        for (UrlConfig candidate: _urlConfig) {
+            PageRequest found = candidate.getRequest(aUrl);
+            if ( found == null ) {
+                continue;
+            }
+            return found;
+        }
+        String message = "No configuration matched the URL '" + aUrl + "'";
+        throw new RuntimeException(message);
+    }
+
+    /* (non-Javadoc)
+     * @see org.wamblee.crawler.Configuration#getRequest(org.wamblee.crawler.PageType)
+     */
+    public PageRequest getRequest(PageType aType) {
+        for (PageTypeConfig candidate: _pageTypeConfig) {
+            PageRequest found = candidate.getRequest(aType.getType());
+            if ( found == null ) {
+                continue;
+            }
+            return found;
+        }
+        String message = "No configuration matched type '" + aType + "'";
+        throw new RuntimeException(message);
+    }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.impl;
+
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.commons.httpclient.NameValuePair;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Element;
+import org.dom4j.io.SAXReader;
+import org.wamblee.crawler.Configuration;
+import org.wamblee.crawler.GetPageRequest;
+import org.wamblee.crawler.PageRequest;
+import org.wamblee.crawler.PostPageRequest;
+
+/**
+ * Parsing of the configuration from an XML file.
+ */
+public class ConfigurationParser {
+
+ private static final String ELEM_URL = "url";
+ private static final String ELEM_TYPE = "type";
+ private static final String ELEM_PATTERN = "pattern";
+ private static final String ELEM_METHOD= "method";
+ private static final String ELEM_XSLT = "xslt";
+ private static final String ELEM_PARAM = "param";
+ private static final String AT_NAME = "name";
+ private static final String AT_VALUE = "value";
+
+ private static final String METHOD_POST = "post";
+ private static final String METHOD_GET = "get";
+
+ private PrintStream _os;
+
+ public ConfigurationParser(PrintStream aOs) {
+ _os = aOs;
+ }
+
+ public Configuration parse(InputStream aStream) {
+ try {
+ SAXReader reader = new SAXReader();
+ Document document = reader.read(aStream);
+
+ Element root = document.getRootElement();
+ List<UrlConfig> urlConfigs = parseUrlConfigs(root);
+ List<PageTypeConfig> pageTypeConfigs = parsePageTypeConfigs(root);
+ return new ConfigurationImpl(urlConfigs, pageTypeConfigs);
+ } catch (DocumentException e) {
+ throw new RuntimeException("Problem parsing config file", e);
+ }
+ }
+
+ /**
+ * @param root
+ * @return
+ */
+ private List<UrlConfig> parseUrlConfigs(Element root) {
+ List<UrlConfig> configs = new ArrayList<UrlConfig>();
+ for (Iterator i = root.elementIterator(ELEM_URL); i.hasNext(); ) {
+ Element url = (Element)i.next();
+ UrlConfig config = parseUrlConfig(url);
+ configs.add(config);
+ }
+ return configs;
+ }
+
+ private List<PageTypeConfig> parsePageTypeConfigs(Element root) {
+ List<PageTypeConfig> configs = new ArrayList<PageTypeConfig>();
+ for (Iterator i = root.elementIterator(ELEM_TYPE); i.hasNext(); ) {
+ Element url = (Element)i.next();
+ PageTypeConfig config = parsePageTypeConfig(url);
+ configs.add(config);
+ }
+ return configs;
+ }
+
+ private UrlConfig parseUrlConfig(Element aUrlElem) {
+ String pattern = aUrlElem.elementText(ELEM_PATTERN);
+ PageRequest request = parseRequestConfig(aUrlElem);
+ return new UrlConfig(pattern, request);
+ }
+
+ private PageTypeConfig parsePageTypeConfig(Element aTypeElem) {
+ String pattern = aTypeElem.elementText(ELEM_PATTERN);
+ PageRequest request = parseRequestConfig(aTypeElem);
+ return new PageTypeConfig(pattern, request);
+ }
+
+ /**
+ * @param aUrlElem
+ * @return
+ */
+ private PageRequest parseRequestConfig(Element aUrlElem) {
+ String method = aUrlElem.elementText(ELEM_METHOD);
+ String xslt = aUrlElem.elementText(ELEM_XSLT);
+ List<NameValuePair> params = new ArrayList<NameValuePair>();
+ for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext(); ) {
+ Element paramElem = (Element)i.next();
+ NameValuePair param = parseParameter(paramElem);
+ params.add(param);
+ }
+
+ NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]);
+ PageRequest request;
+ if ( METHOD_POST.equals(method)) {
+ request = new PostPageRequest(paramsArray, xslt, _os);
+ }
+ else if ( METHOD_GET.equals(method) || method == null ){
+ request = new GetPageRequest(paramsArray, xslt, _os);
+ } else {
+ throw new RuntimeException("Unknown request method '" + method + "'. Only " +
+ METHOD_GET + " and " + METHOD_POST + " are supported");
+ }
+ return request;
+ }
+
+ private NameValuePair parseParameter(Element aParam) {
+ String name = aParam.attributeValue(AT_NAME);
+ String value = aParam.attributeValue(AT_VALUE);
+ return new NameValuePair(name, value);
+ }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.impl;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Element;
+import org.dom4j.io.DOMReader;
+import org.w3c.dom.Document;
+import org.wamblee.crawler.Configuration;
+import org.wamblee.crawler.Crawler;
+import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageRequest;
+import org.wamblee.crawler.PageType;
+
+/**
+ * Crawler that looks up the request to perform in a {@link Configuration},
+ * executes it with an {@link HttpClient} and wraps the resulting document in
+ * a {@link Page}.
+ */
+public class CrawlerImpl implements Crawler {
+
+    private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
+
+    /** Client handed to every request that is executed. */
+    private HttpClient _client;
+
+    /** Source of the request to use for a URL or page type. */
+    private Configuration _config;
+
+    /**
+     * Constructs the crawler.
+     *
+     * @param aClient
+     *            Client with which requests are executed.
+     * @param aConfig
+     *            Configuration from which requests are obtained.
+     */
+    public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
+        _client = aClient;
+        _config = aConfig;
+    }
+
+    /**
+     * Retrieves a page using the request configured for its URL.
+     *
+     * @param aUrl
+     *            URL of the page.
+     * @return Retrieved page.
+     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
+     */
+    public Page getPage(String aUrl) {
+        LOG.info("Getting page: url = '" + aUrl + "'");
+        return retrieve(aUrl, _config.getRequest(aUrl));
+    }
+
+    /**
+     * Retrieves a page using the request configured for its page type.
+     *
+     * @param aUrl
+     *            URL of the page.
+     * @param aType
+     *            Type of the page, used to select the request.
+     * @return Retrieved page.
+     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
+     *      org.wamblee.crawler.PageType)
+     */
+    public Page getPage(String aUrl, PageType aType) {
+        LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
+        return retrieve(aUrl, _config.getRequest(aType));
+    }
+
+    /**
+     * Executes a request for a URL and converts the result into a page.
+     *
+     * @param aUrl
+     *            URL to retrieve.
+     * @param aRequest
+     *            Request to execute.
+     * @return Page built from the document the request produced.
+     */
+    private Page retrieve(String aUrl, PageRequest aRequest) {
+        Document w3cDocument = aRequest.execute(aUrl, _client);
+        return transformToDom4jDoc(w3cDocument);
+    }
+
+    /**
+     * Converts a W3C DOM document into a page backed by a dom4j element.
+     *
+     * @param content
+     *            W3C DOM document to convert.
+     * @return Page whose content is the root element of the converted
+     *         document.
+     */
+    private Page transformToDom4jDoc(Document content) {
+        org.dom4j.Document converted = new DOMReader().read(content);
+        Element rootElement = converted.getRootElement();
+        // Take the root element out of the converted document before it is
+        // handed to the page.
+        converted.remove(rootElement);
+        Element completed = replaceReferencesWithContent(rootElement);
+        return new PageImpl(this, completed);
+    }
+
+    /**
+     * Intended to perform the actual crawling: find references in the
+     * retrieved content and replace them by the content they refer to.
+     * Currently the content is returned as is.
+     *
+     * @param content
+     *            Content which must be made complete.
+     * @return The content that was passed in.
+     */
+    private Element replaceReferencesWithContent(Element content) {
+        return content; // TODO implement.
+    }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.impl;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.dom4j.DocumentHelper;
+import org.dom4j.Element;
+import org.dom4j.XPath;
+import org.wamblee.crawler.Action;
+import org.wamblee.crawler.Crawler;
+import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageType;
+
+/**
+ * Page implementation. The page wraps an element and exposes the
+ * <code>action</code> child elements of that element as {@link Action}
+ * objects.
+ */
+public class PageImpl implements Page {
+
+    private static final String ELEM_NAME = "action";
+
+    private static final String ATT_NAME = "name";
+
+    private static final String ATT_HREF = "reference";
+
+    private static final String ATT_TYPE = "type";
+
+    /** Crawler that is handed to every action of this page. */
+    private Crawler _crawler;
+
+    /** Content of the page. */
+    private Element _content;
+
+    /** Actions found in the content, computed once at construction. */
+    private Action[] _actions;
+
+    /**
+     * Constructs a page.
+     *
+     * @param aCrawler
+     *            Crawler that is passed on to the actions of the page.
+     * @param aContent
+     *            Content of the page.
+     */
+    public PageImpl(Crawler aCrawler, Element aContent) {
+        _crawler = aCrawler;
+        _content = aContent;
+        _actions = computeActions();
+    }
+
+    /**
+     * Creates one action for every element selected by the XPath expression
+     * <code>action</code> evaluated against the content.
+     *
+     * @return Actions in the order in which the elements were selected.
+     */
+    private Action[] computeActions() {
+        XPath xpath = DocumentHelper.createXPath(ELEM_NAME);
+        // The expression is a plain element name, so only elements are
+        // expected in the result.
+        @SuppressWarnings("unchecked")
+        List<Element> results = (List<Element>) xpath.selectNodes(_content);
+        List<Action> actions = new ArrayList<Action>();
+        for (Element elem : results) {
+            String name = elem.attributeValue(ATT_NAME);
+            String href = elem.attributeValue(ATT_HREF);
+            String type = elem.attributeValue(ATT_TYPE);
+            if (type == null) {
+                // No type attribute: create the action without a page type.
+                actions.add(new ActionImpl(_crawler, elem, name, href));
+            } else {
+                actions.add(new ActionImpl(_crawler, elem, name, href, new PageType(type)));
+            }
+        }
+        return actions.toArray(new Action[0]);
+    }
+
+    /**
+     * Gets the content of the page.
+     *
+     * @return Content element that was passed to the constructor.
+     * @see org.wamblee.crawler.Page#getContent()
+     */
+    public Element getContent() {
+        return _content;
+    }
+
+    /**
+     * Gets the actions of the page.
+     *
+     * @return Actions computed at construction; the internal array itself is
+     *         returned, not a copy.
+     * @see org.wamblee.crawler.Page#getActions()
+     */
+    public Action[] getActions() {
+        return _actions;
+    }
+
+    /**
+     * Gets the action with a given name.
+     *
+     * @param aName
+     *            Name of the action.
+     * @return The action, or null if no action has this name.
+     * @throws RuntimeException
+     *             If more than one action has this name.
+     * @see org.wamblee.crawler.Page#getAction(java.lang.String)
+     */
+    public Action getAction(String aName) {
+        Action found = null;
+        for (Action action : _actions) {
+            // Actions are created even when the name attribute is absent, so
+            // a name may be null. Such an action never matches; it must not
+            // make the lookup fail with a NullPointerException.
+            String name = action.getName();
+            if (name == null || !name.equals(aName)) {
+                continue;
+            }
+            if (found != null) {
+                throw new RuntimeException("Duplicate link '" + aName + "'");
+            }
+            found = action;
+        }
+        return found;
+    }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.impl;
+
+import org.wamblee.crawler.PageRequest;
+
+/**
+ *
+ */
+public class PageTypeConfig extends ConfigItem<PageRequest> {
+
+ public PageTypeConfig(String aPattern, PageRequest aRequest) {
+ super(aPattern, aRequest);
+ }
+
+ public PageRequest getRequest(String aType) {
+ return match(aType);
+ }
+}
--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.impl;
+
+import org.wamblee.crawler.PageRequest;
+
+/**
+ * Represents the configuration for specific URLs.
+ */
+public class UrlConfig extends ConfigItem<PageRequest> {
+ /**
+ * Constructs the information for how to perform a request for a specific
+ * URL.
+ *
+ * @param aPattern
+ * Pattern that the URL must match.
+ * @param aRequest
+ * Request that must be executed to retrieve the URL.
+ */
+ public UrlConfig(String aPattern, PageRequest aRequest) {
+ super(aPattern, aRequest);
+ }
+
+ /**
+ * Gets the request to execute.
+ *
+ * @return Request, or null if the URL does not match.
+ */
+ public PageRequest getRequest(String aUrl) {
+ return match(aUrl);
+ }
+}