From: Erik Brakkee Date: Tue, 14 Mar 2006 20:39:10 +0000 (+0000) Subject: (no commit message) X-Git-Tag: wamblee-utils-0.7~1163 X-Git-Url: http://wamblee.org/gitweb/?a=commitdiff_plain;h=fdcf105c3645f4a5b70c2e78527a5a5b00d1c41e;p=utils --- diff --git a/crawler/build.xml b/crawler/build.xml new file mode 100644 index 00000000..5f7ee859 --- /dev/null +++ b/crawler/build.xml @@ -0,0 +1,32 @@ + + + + +]> + + + + + + + + + + + &header; + + + + + + + + + &trailer; + + + diff --git a/crawler/config.xml b/crawler/config.xml new file mode 100644 index 00000000..21531c25 --- /dev/null +++ b/crawler/config.xml @@ -0,0 +1,41 @@ + + + + + channels-favorites + channels-favorites.xsl + + + + channel-overview + channel-overview.xsl + + + + right-now + channel-right-now.xsl + + + + program-info + program-info.xsl + + + + + http://epg.kml.kiss-technology.com/login_core.php + post + login.xsl + + + + + + + + .* + get + identity.xsl + + + diff --git a/crawler/src/Main.java b/crawler/src/Main.java new file mode 100644 index 00000000..8287a6ec --- /dev/null +++ b/crawler/src/Main.java @@ -0,0 +1,118 @@ +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.httpclient.methods.PostMethod; +import org.apache.commons.httpclient.params.HttpClientParams; +import org.apache.commons.httpclient.params.HttpMethodParams; +import org.w3c.tidy.Tidy; + +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * + */ +public class Main { + + private static final String BASE = "http://epg.kml.kiss-technology.com/"; + + private static int count = 0; + + public static void main(String[] aArgs) { + HttpClientParams clientParams = new HttpClientParams(); + clientParams.setIntParameter(HttpClientParams.MAX_REDIRECTS, 10); + clientParams.setBooleanParameter(HttpClientParams.REJECT_RELATIVE_REDIRECT, false); + HttpClient client = new HttpClient(clientParams); + client.getHostConfiguration().setProxy("localhost", 10000); + + clientParams = client.getParams(); + Object obj = clientParams.getParameter(HttpClientParams.MAX_REDIRECTS); + System.out.println("Max redirects = " + obj); + HttpMethod method = new GetMethod(BASE + "l.php"); + + executeMethod(client, method); + + PostMethod postMethod = new PostMethod(BASE + "login_core.php"); + HttpMethodParams params = new HttpMethodParams(); + params.setParameter("user", "erik@brakkee.org"); + params.setParameter("passwd", "ebra1969"); + params.setParameter("SavePlayerID", ""); + params.setParameter("GMode", "TextMode"); + params.setParameter("submit", "Login"); + + NameValuePair[] data = new NameValuePair[] { + new NameValuePair("user", "erik@brakkee.org"), + new NameValuePair("passwd", "ebra1969"), + new NameValuePair("GMode", "TextMode"), + new NameValuePair("submit", "Login") + }; + postMethod.addParameters(data); + + + executeMethod(client, postMethod); + + Header header = postMethod.getResponseHeader("Location"); + System.out.println("Redirecting to: " + header.getValue()); + method = new 
GetMethod(header.getValue()); + executeMethod(client, method); + + + } + + /** + * @param client + * @param method + */ + private static int executeMethod(HttpClient client, HttpMethod method) { + //method.setFollowRedirects(true); + try { + // Execute the method. + int statusCode = client.executeMethod(method); + + if (statusCode != HttpStatus.SC_OK) { + System.err.println("Method failed: " + method.getStatusLine()); + } + + // Read the response body. + String filename = "output" + count++; + FileOutputStream os = new FileOutputStream(new File(filename)); + //os.write(method.getResponseBody()); + + Tidy tidy = new Tidy(); + tidy.setXHTML(true); + tidy.parse(method.getResponseBodyAsStream(), os); + os.close(); + System.out.println("Written response to file: " + filename); + return statusCode; + } catch (HttpException e) { + throw new RuntimeException("Fatal protocol violation: " + e.getMessage()); + } catch (IOException e) { + throw new RuntimeException("Fatal transport error: " + e.getMessage()); + } finally { + // Release the connection. + method.releaseConnection(); + } + } + +} diff --git a/crawler/src/log4j.properties b/crawler/src/log4j.properties new file mode 100644 index 00000000..ab710b36 --- /dev/null +++ b/crawler/src/log4j.properties @@ -0,0 +1,56 @@ + +############################################################################################ +# Default configuration file for log4j. +# +# This properties file is used if no other configuration if log4j is done explicitly. 
+############################################################################################ + + +# Root logger reports everything and uses the console appender +log4j.rootLogger=ERROR, console + +# Log level for wamblee.org +log4j.logger.org.wamblee=DEBUG +log4j.logger.org.wamblee.usermgt.UserAdministrationImplTest=INFO +log4j.logger.org.wamblee.security.authorization=ERROR +log4j.logger.org.wamblee.cache=INFO + + +log4j.logger.org.springframework=ERROR +log4j.logger.net.sf.ehcache=WARN + +# Default log level for hibernate +log4j.logger.org.hibernate=ERROR +log4j.logger.org.hibernate3=ERROR + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n + +###################################################################################### +# Hibernate SQL logging, switch the log level to DEBUG to see the output. +###################################################################################### + +log4j.logger.org.wamblee.test.SpringTestCase=ERROR, console +log4j.additivity.org.wamblee.test.SpringTestCase=false + +# Logging for queries. +log4j.logger.org.hibernate.SQL=ERROR, sql +log4j.additivity.org.hibernate.SQL=false + +# Logging for query parameters and return values. +log4j.logger.org.hibernate.type=ERROR, sqltype +log4j.additivity.org.hibernate.type=false + +# Appender for the queries +log4j.appender.sql=org.apache.log4j.ConsoleAppender +log4j.appender.sql.layout=org.apache.log4j.PatternLayout +log4j.appender.sql.layout.ConversionPattern=%n%-4r [%t] SQL: %x - %m%n + +# Appender to show the actual parameters and return values of the queries. 
+log4j.appender.sqltype=org.apache.log4j.ConsoleAppender +log4j.appender.sqltype.layout=org.apache.log4j.PatternLayout +log4j.appender.sqltype.layout.ConversionPattern=%-4r [%t] SQL: %x - %m%n + + + diff --git a/crawler/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/src/org/wamblee/crawler/AbstractPageRequest.java new file mode 100644 index 00000000..63764f53 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/AbstractPageRequest.java @@ -0,0 +1,152 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wamblee.crawler; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.w3c.dom.Document; +import org.w3c.tidy.Tidy; +import org.wamblee.xml.XSLT; + +/** + * General support claas for all kinds of requests. + */ +public abstract class AbstractPageRequest implements PageRequest { + + private static final Log LOG = LogFactory.getLog(AbstractPageRequest.class); + private static final String REDIRECT_HEADER = "Location"; + + private NameValuePair[] _params; + + private String _xslt; + + private PrintStream _os; + + protected AbstractPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) { + if ( aParams == null ) { + throw new IllegalArgumentException("aParams is null"); + } + if ( aXslt == null ) { + throw new IllegalArgumentException("aXslt is null"); + } + _params = aParams; + _xslt = aXslt; + _os = aOs; + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.PageRequest#overrideXslt(java.lang.String) + */ + public void overrideXslt(String aXslt) { + _xslt = aXslt; + } + + protected NameValuePair[] getParameters() { + return _params; + } + + protected Document executeMethod(HttpClient client, HttpMethod method) { + try { + // Execute the method. 
+ method = executeWithRedirects(client, method); + + // Transform the HTML into wellformed XML. + Tidy tidy = new Tidy(); + tidy.setXHTML(true); + tidy.setQuiet(true); + tidy.setShowWarnings(false); + if ( _os != null ) { + _os.println("Content of '" + method.getURI() + "'"); + _os.println(); + } + // We let jtidy produce raw output because the DOM it produces is + // is not namespace aware. We let the XSLT processor parse the XML again + // to ensure that the XSLT uses a namespace aware DOM tree. An alternative + // is to configure namespace awareness of the XML parser in a system wide way. + ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); + tidy.parse(method.getResponseBodyAsStream(), xhtml); + _os.print(new String(xhtml.toByteArray())); + // Obtaining the XML as dom is not used. + //Document w3cDoc = tidy.parseDOM(method.getResponseBodyAsStream(), + // _os); + if ( _os != null ) { + _os.println(); + } + xhtml.flush(); + byte[] xhtmlData = xhtml.toByteArray(); + Document transformed = XSLT.transform(xhtmlData, new File(_xslt)); + _os.println("Transformed result is: "); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setParameter(OutputKeys.INDENT, "yes"); + transformer.setParameter(OutputKeys.METHOD, "xml"); + transformer.transform(new DOMSource(transformed), new StreamResult(_os)); + + return transformed; + } catch (Exception e) { + throw new RuntimeException(e.getMessage(), e); + } finally { + // Release the connection. 
+ method.releaseConnection(); + } + } + + /** + * @param aClient + * @param aMethod + * @throws IOException + * @throws HttpException + */ + private HttpMethod executeWithRedirects(HttpClient aClient, HttpMethod aMethod) throws IOException, HttpException { + int statusCode = aClient.executeMethod(aMethod); + + switch (statusCode) { + case HttpStatus.SC_OK: { + return aMethod; + } + case HttpStatus.SC_MOVED_PERMANENTLY: + case HttpStatus.SC_MOVED_TEMPORARILY: + case HttpStatus.SC_SEE_OTHER: { + aMethod.releaseConnection(); + Header header = aMethod.getResponseHeader(REDIRECT_HEADER); + aMethod = new GetMethod(header.getValue()); + return executeWithRedirects(aClient, aMethod); // TODO protect against infinite recursion. + } + default: { + throw new RuntimeException("Method failed: " + + aMethod.getStatusLine()); + } + } + } +} diff --git a/crawler/src/org/wamblee/crawler/Action.java b/crawler/src/org/wamblee/crawler/Action.java new file mode 100644 index 00000000..a4df7a1f --- /dev/null +++ b/crawler/src/org/wamblee/crawler/Action.java @@ -0,0 +1,44 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler; + +import org.dom4j.Element; + +/** + * An action defined on a page. + */ +public interface Action { + + /** + * The name of the action. + * @return Action name. + */ + String getName(); + + /** + * Executes the action. 
+     * @return Page that results from executing the action.
+     */
+    Page execute();
+
+    /**
+     * Gets a description of the action. The element returned is the action element
+     * itself.
+     * @return Element describing the action.
+     */
+    Element getContent();
+}
diff --git a/crawler/src/org/wamblee/crawler/Configuration.java b/crawler/src/org/wamblee/crawler/Configuration.java
new file mode 100644
index 00000000..662182ca
--- /dev/null
+++ b/crawler/src/org/wamblee/crawler/Configuration.java
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+/**
+ * Configuration which determines how a specific page must be retrieved and
+ * what transformation should be applied to it.
+ */
+public interface Configuration {
+
+    /**
+     * Gets the page request based on the URL.
+     * @param aUrl Url of the page to retrieve.
+     * @return Page request.
+     */
+    PageRequest getRequest(String aUrl);
+
+    /**
+     * Gets the page request based on the type of the page instead
+     * of on the URL.
+     * @param aType Type of page.
+     * @return Page request.
+     */
+    PageRequest getRequest(PageType aType);
+}
diff --git a/crawler/src/org/wamblee/crawler/Crawler.java b/crawler/src/org/wamblee/crawler/Crawler.java
new file mode 100644
index 00000000..f55eebb3
--- /dev/null
+++ b/crawler/src/org/wamblee/crawler/Crawler.java
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+
+/**
+ * Retrieves pages, either by URL alone or by URL together with an explicit
+ * page type.
+ */
+public interface Crawler {
+
+    /**
+     * Gets the content for a specific page.
+     * @param aUrl Url of page.
+     * @return The retrieved page.
+     */
+    Page getPage(String aUrl);
+
+    /**
+     * Gets the content for a specific page.
+     * @param aUrl Url of page.
+     * @param aType Type of page.
+     * @return Page.
+     */
+    Page getPage(String aUrl, PageType aType);
+}
diff --git a/crawler/src/org/wamblee/crawler/GetPageRequest.java b/crawler/src/org/wamblee/crawler/GetPageRequest.java
new file mode 100644
index 00000000..7d99c1e8
--- /dev/null
+++ b/crawler/src/org/wamblee/crawler/GetPageRequest.java
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.wamblee.crawler; + +import java.io.PrintStream; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.httpclient.methods.GetMethod; +import org.w3c.dom.Document; + +/** + * Gets a page by issueing a get request. + */ +public class GetPageRequest extends AbstractPageRequest { + + public GetPageRequest(NameValuePair[] aParams, String aXslt) { + super(aParams, aXslt, null); + } + + public GetPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) { + super(aParams, aXslt, aOs); + } + + + /* (non-Javadoc) + * @see org.wamblee.crawler.PageRequest#getPage(org.apache.commons.httpclient.HttpClient) + */ + public Document execute(String aUrl, HttpClient aClient) { + HttpMethod method = new GetMethod(aUrl); + if ( getParameters().length > 0 ) { + String oldQueryString = method.getQueryString(); + method.setQueryString(getParameters()); + String queryString = method.getQueryString(); + if ( oldQueryString.length() > 0 ) { + queryString = queryString + '&' + oldQueryString; + method.setQueryString(queryString); + } + } + + return executeMethod(aClient, method); + } + +} diff --git a/crawler/src/org/wamblee/crawler/Page.java b/crawler/src/org/wamblee/crawler/Page.java new file mode 100644 index 00000000..18cf37a6 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/Page.java @@ -0,0 +1,44 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+import org.dom4j.Element;
+
+/**
+ * Represents a retrieved page.
+ */
+public interface Page {
+
+    /**
+     * Gets the content of the page as raw XML.
+     * @return Page content.
+     */
+    Element getContent();
+
+    /**
+     * Obtains the actions (links) available on the page.
+     * @return Actions of the page.
+     */
+    Action[] getActions();
+
+    /**
+     * Gets the named action. Only works if the action name is unique.
+     * @param aName Name of the action.
+     * @return Action object.
+     */
+    Action getAction(String aName);
+}
diff --git a/crawler/src/org/wamblee/crawler/PageRequest.java b/crawler/src/org/wamblee/crawler/PageRequest.java
new file mode 100644
index 00000000..cf88bbf8
--- /dev/null
+++ b/crawler/src/org/wamblee/crawler/PageRequest.java
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.w3c.dom.Document;
+
+/**
+ * Represents a specific request to obtain and transform a page.
+ */
+public interface PageRequest {
+
+    /**
+     * Gets a page as an XML document.
+     * @param aUrl Url of the page to get.
+     * @param aClient Http client to use.
+     * @return The page as an XML document.
+     */
+    Document execute(String aUrl, HttpClient aClient);
+
+    /**
+     * Overrides the Xslt to use.
+     * @param aXslt Xslt to use.
+ */ + void overrideXslt(String aXslt); +} diff --git a/crawler/src/org/wamblee/crawler/PageType.java b/crawler/src/org/wamblee/crawler/PageType.java new file mode 100644 index 00000000..9d2af30b --- /dev/null +++ b/crawler/src/org/wamblee/crawler/PageType.java @@ -0,0 +1,41 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler; + +/** + * + */ +public class PageType { + + private String _type; + + public PageType(String aType) { + _type = aType; + } + + public String getType() { + return _type; + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return "PageType(type='" + _type + "')"; + } +} diff --git a/crawler/src/org/wamblee/crawler/PostPageRequest.java b/crawler/src/org/wamblee/crawler/PostPageRequest.java new file mode 100644 index 00000000..10ad783a --- /dev/null +++ b/crawler/src/org/wamblee/crawler/PostPageRequest.java @@ -0,0 +1,49 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler; + +import java.io.PrintStream; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.httpclient.methods.PostMethod; +import org.w3c.dom.Document; + +/** + * Retrieving pages using the post method. + */ +public class PostPageRequest extends AbstractPageRequest { + + public PostPageRequest(NameValuePair[] aParams, String aXslt) { + super(aParams, aXslt, null); + } + + public PostPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) { + super(aParams, aXslt, aOs); + } + + + /* (non-Javadoc) + * @see org.wamblee.crawler.PageRequest#execute(java.lang.String, org.apache.commons.httpclient.HttpClient) + */ + public Document execute(String aUrl, HttpClient aClient) { + PostMethod method = new PostMethod(aUrl); + method.addParameters(getParameters()); + return executeMethod(aClient, method); + } + +} diff --git a/crawler/src/org/wamblee/crawler/impl/ActionImpl.java b/crawler/src/org/wamblee/crawler/impl/ActionImpl.java new file mode 100644 index 00000000..d0fe0806 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/impl/ActionImpl.java @@ -0,0 +1,75 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import org.dom4j.Element; +import org.wamblee.crawler.Action; +import org.wamblee.crawler.Crawler; +import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageType; + +/** + * + */ +public class ActionImpl implements Action { + + private Crawler _crawler; + private Element _content; + private String _name; + private String _reference; + private PageType _type; + + public ActionImpl(Crawler aCrawler, Element aContent, String aName, String aReference) { + _crawler = aCrawler; + _content = aContent; + _name = aName; + _reference = aReference; + _type = null; + } + + public ActionImpl(Crawler aCrawler, Element aContent, String aName, String aReference, PageType aType) { + _crawler = aCrawler; + _content = aContent; + _name = aName; + _reference = aReference; + _type = aType; + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.Action#getName() + */ + public String getName() { + return _name; + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.Action#execute() + */ + public Page execute() { + if ( _type == null) { + return _crawler.getPage(_reference); + } + return _crawler.getPage(_reference, _type); + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.Action#getContent() + */ + public Element getContent() { + return _content; + } +} diff --git a/crawler/src/org/wamblee/crawler/impl/App.java b/crawler/src/org/wamblee/crawler/impl/App.java new file mode 100644 index 00000000..75fd3b09 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/impl/App.java @@ -0,0 +1,108 @@ +package 
org.wamblee.crawler.impl; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.PrintStream; + +import org.apache.commons.httpclient.HttpClient; +import org.dom4j.Element; +import org.wamblee.crawler.Action; +import org.wamblee.crawler.Configuration; +import org.wamblee.crawler.Crawler; +import org.wamblee.crawler.Page; + +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Entry point for the crawler. 
+ */ +public class App { + + private static final String LOG_FILE = "crawler.log"; + + public static void main(String[] args) throws Exception { + String configFileName = args[0]; + String starturl = args[1]; + + FileOutputStream fos = new FileOutputStream(new File(LOG_FILE)); + PrintStream os = new PrintStream(fos); + + try { + ConfigurationParser parser = new ConfigurationParser(os); + InputStream configFile = new FileInputStream(new File( + configFileName)); + Configuration config = parser.parse(configFile); + + HttpClient client = new HttpClient(); + // client.getHostConfiguration().setProxy("localhost", 3128); + + Crawler crawler = new CrawlerImpl(client, config); + + System.out.println("Retrieving: " + starturl); + Page page = crawler.getPage(starturl); + showPage(page); + page = page.getAction("channels-favorites").execute(); + recordInterestingShows(page); + showPage(page); + page = page.getAction("Nederland 1").execute(); + showPage(page); + page = page.getAction("right-now").execute(); + showPage(page); + page = page.getAction("Het elfde uur").execute(); + showPage(page); + } finally { + os.flush(); + os.close(); + System.out.println("Output written on '" + LOG_FILE + "'"); + } + } + + /** + * @param starturl + * @param crawler + */ + private static void showPage(Page aPage) { + Action[] links = aPage.getActions(); + for (Action link: links) { + System.out.println("Link found '" + link.getName() + "'"); + } + Element element = aPage.getContent(); + System.out.println("Retrieved content: " + element.asXML()); + } + + private static void recordInterestingShows(Page page) { + Action[] channels = page.getActions(); + for (Action channel: channels) { + examineChannel(channel.getName(), channel.execute().getAction("right-now").execute()); + } + } + + private static void examineChannel(String aChannel, Page aPage) { + Action[] programs = aPage.getActions(); + for (Action program: programs) { + System.out.println(aChannel + " - " + program.getName()); + if ( 
program.getName().toLowerCase().matches(".*babe.*")) { + Page programPage = program.execute(); + Action record = programPage.getAction("record"); + System.out.println("Recording possible: " + record != null); + } + } + } + +} diff --git a/crawler/src/org/wamblee/crawler/impl/ConfigItem.java b/crawler/src/org/wamblee/crawler/impl/ConfigItem.java new file mode 100644 index 00000000..7dfd9169 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/impl/ConfigItem.java @@ -0,0 +1,41 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import java.util.regex.Pattern; + +/** + * + */ +class ConfigItem { + + private Pattern _pattern; + private ValueType _value; + + protected ConfigItem(String aPattern, ValueType aValue) { + _pattern = Pattern.compile(aPattern); + _value = aValue; + } + + protected ValueType match(String aValue) { + if ( !_pattern.matcher(aValue).matches() ) { + return null; + } + return _value; + } + +} diff --git a/crawler/src/org/wamblee/crawler/impl/ConfigurationImpl.java b/crawler/src/org/wamblee/crawler/impl/ConfigurationImpl.java new file mode 100644 index 00000000..54874559 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/impl/ConfigurationImpl.java @@ -0,0 +1,65 @@ +/* + * Copyright 2005 the original author or authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import java.util.List; + +import org.wamblee.crawler.Configuration; +import org.wamblee.crawler.PageRequest; +import org.wamblee.crawler.PageType; + + +/** + * Implementation of the configuration for the crawler. + */ +public class ConfigurationImpl implements Configuration { + + private List _urlConfig; + private List _pageTypeConfig; + + public ConfigurationImpl(List aUrlConfig, List aPageTypeConfig) { + _urlConfig = aUrlConfig; + _pageTypeConfig = aPageTypeConfig; + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.Configuration#getRequest(java.lang.String) + */ + public PageRequest getRequest(String aUrl) { + + for (UrlConfig config: _urlConfig) { + PageRequest request = config.getRequest(aUrl); + if ( request != null ) { + return request; + } + } + throw new RuntimeException("No configuration matched the URL '" + aUrl + "'"); + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.Configuration#getRequest(org.wamblee.crawler.PageType) + */ + public PageRequest getRequest(PageType aType) { + for (PageTypeConfig config: _pageTypeConfig) { + PageRequest request = config.getRequest(aType.getType()); + if ( request != null ) { + return request; + } + } + throw new RuntimeException("No configuration matched type '" + aType + "'"); + } +} diff --git a/crawler/src/org/wamblee/crawler/impl/ConfigurationParser.java 
b/crawler/src/org/wamblee/crawler/impl/ConfigurationParser.java new file mode 100644 index 00000000..89e815c8 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/impl/ConfigurationParser.java @@ -0,0 +1,141 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import java.io.InputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.httpclient.NameValuePair; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; +import org.wamblee.crawler.Configuration; +import org.wamblee.crawler.GetPageRequest; +import org.wamblee.crawler.PageRequest; +import org.wamblee.crawler.PostPageRequest; + +/** + * Parsing of the configuration from an XML file. 
+ */ +public class ConfigurationParser { + + private static final String ELEM_URL = "url"; + private static final String ELEM_TYPE = "type"; + private static final String ELEM_PATTERN = "pattern"; + private static final String ELEM_METHOD= "method"; + private static final String ELEM_XSLT = "xslt"; + private static final String ELEM_PARAM = "param"; + private static final String AT_NAME = "name"; + private static final String AT_VALUE = "value"; + + private static final String METHOD_POST = "post"; + private static final String METHOD_GET = "get"; + + private PrintStream _os; + + public ConfigurationParser(PrintStream aOs) { + _os = aOs; + } + + public Configuration parse(InputStream aStream) { + try { + SAXReader reader = new SAXReader(); + Document document = reader.read(aStream); + + Element root = document.getRootElement(); + List urlConfigs = parseUrlConfigs(root); + List pageTypeConfigs = parsePageTypeConfigs(root); + return new ConfigurationImpl(urlConfigs, pageTypeConfigs); + } catch (DocumentException e) { + throw new RuntimeException("Problem parsing config file", e); + } + } + + /** + * @param root + * @return + */ + private List parseUrlConfigs(Element root) { + List configs = new ArrayList(); + for (Iterator i = root.elementIterator(ELEM_URL); i.hasNext(); ) { + Element url = (Element)i.next(); + UrlConfig config = parseUrlConfig(url); + configs.add(config); + } + return configs; + } + + private List parsePageTypeConfigs(Element root) { + List configs = new ArrayList(); + for (Iterator i = root.elementIterator(ELEM_TYPE); i.hasNext(); ) { + Element url = (Element)i.next(); + PageTypeConfig config = parsePageTypeConfig(url); + configs.add(config); + } + return configs; + } + + private UrlConfig parseUrlConfig(Element aUrlElem) { + String pattern = aUrlElem.elementText(ELEM_PATTERN); + PageRequest request = parseRequestConfig(aUrlElem); + return new UrlConfig(pattern, request); + } + + private PageTypeConfig parsePageTypeConfig(Element aTypeElem) { + 
String pattern = aTypeElem.elementText(ELEM_PATTERN); + PageRequest request = parseRequestConfig(aTypeElem); + return new PageTypeConfig(pattern, request); + } + + /** + * @param aUrlElem + * @return + */ + private PageRequest parseRequestConfig(Element aUrlElem) { + String method = aUrlElem.elementText(ELEM_METHOD); + String xslt = aUrlElem.elementText(ELEM_XSLT); + List params = new ArrayList(); + for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext(); ) { + Element paramElem = (Element)i.next(); + NameValuePair param = parseParameter(paramElem); + params.add(param); + } + + NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]); + PageRequest request; + if ( METHOD_POST.equals(method)) { + request = new PostPageRequest(paramsArray, xslt, _os); + } + else if ( METHOD_GET.equals(method) || method == null ){ + request = new GetPageRequest(paramsArray, xslt, _os); + } else { + throw new RuntimeException("Unknown request method '" + method + "'. Only " + + METHOD_GET + " and " + METHOD_POST + " are supported"); + } + return request; + } + + private NameValuePair parseParameter(Element aParam) { + String name = aParam.attributeValue(AT_NAME); + String value = aParam.attributeValue(AT_VALUE); + return new NameValuePair(name, value); + } +} diff --git a/crawler/src/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/src/org/wamblee/crawler/impl/CrawlerImpl.java new file mode 100644 index 00000000..8db31606 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/impl/CrawlerImpl.java @@ -0,0 +1,91 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Element; +import org.dom4j.io.DOMReader; +import org.w3c.dom.Document; +import org.wamblee.crawler.Configuration; +import org.wamblee.crawler.Crawler; +import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageRequest; +import org.wamblee.crawler.PageType; + +/** + * Crawler implementation. + */ +public class CrawlerImpl implements Crawler { + + private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); + + private HttpClient _client; + private Configuration _config; + + public CrawlerImpl(HttpClient aClient, Configuration aConfig) { + _client = aClient; + _config = aConfig; + } + + /* + * (non-Javadoc) + * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) + */ + public Page getPage(String aUrl) { + LOG.info("Getting page: url = '" + aUrl + "'"); + PageRequest request = _config.getRequest(aUrl); + Document content = request.execute(aUrl, _client); + return transformToDom4jDoc(content); + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String) + */ + public Page getPage(String aUrl, PageType aType) { + LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); + PageRequest request = _config.getRequest(aType); + Document content = request.execute(aUrl, _client); + return transformToDom4jDoc(content); + } + + /** + * @param aUrl + * @param request + * @return + */ + 
private Page transformToDom4jDoc(Document content) { + + DOMReader reader = new DOMReader(); + org.dom4j.Document dom4jDoc = reader.read(content); + Element root = dom4jDoc.getRootElement(); + dom4jDoc.remove(root); + + return new PageImpl(this, replaceReferencesWithContent(root)); + } + + /** + * Perform crawling. Find references in the retrieved content and replace them + * by the content they refer to by retrieving the appropriate pages as well. + * @param content Content which must be made complete. + * @return Fully processed content. + */ + private Element replaceReferencesWithContent(Element content) { + return content; // TODO implement. + } +} diff --git a/crawler/src/org/wamblee/crawler/impl/PageImpl.java b/crawler/src/org/wamblee/crawler/impl/PageImpl.java new file mode 100644 index 00000000..90512222 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/impl/PageImpl.java @@ -0,0 +1,118 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import java.util.ArrayList; +import java.util.List; + +import org.dom4j.DocumentHelper; +import org.dom4j.Element; +import org.dom4j.XPath; +import org.wamblee.crawler.Action; +import org.wamblee.crawler.Crawler; +import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageType; + +/** + * Page implementation. 
+ */ +public class PageImpl implements Page { + + private static final String ELEM_NAME = "action"; + + private static final String ATT_NAME = "name"; + + private static final String ATT_HREF = "reference"; + + private static final String ATT_TYPE = "type"; + + private Crawler _crawler; + + private Element _content; + + private Action[] _actions; + + /** + * Constructs a page. + * + * @param aContent + */ + public PageImpl(Crawler aCrawler, Element aContent) { + _crawler = aCrawler; + _content = aContent; + _actions = computeActions(); + } + + /* + * (non-Javadoc) + * + * @see org.wamblee.crawler.Page#getLinkNames() + */ + private Action[] computeActions() { + XPath xpath = DocumentHelper.createXPath(ELEM_NAME); + List results = (List) xpath.selectNodes(_content); + List names = new ArrayList(); + for (Element elem : results) { + String name = elem.attributeValue(ATT_NAME); + String href = elem.attributeValue(ATT_HREF); + String type = elem.attributeValue(ATT_TYPE); + if (type == null ) { + names.add(new ActionImpl(_crawler, elem, name, href)); + } + else { + names.add(new ActionImpl(_crawler, elem, name, href, new PageType(type))); + } + } + return names.toArray(new Action[0]); + } + + /* + * (non-Javadoc) + * + * @see org.wamblee.crawler.Page#getContent() + */ + public Element getContent() { + return _content; + } + + /* (non-Javadoc) + * @see org.wamblee.crawler.Page#getActions() + */ + public Action[] getActions() { + return _actions; + } + + /* + * (non-Javadoc) + * @see org.wamblee.crawler.Page#getAction(java.lang.String) + */ + public Action getAction(String aName) { + List results = new ArrayList(); + for (Action action: _actions) { + if ( action.getName().equals(aName)) { + results.add(action); + } + } + if (results.size() == 0) { + return null; + } + if (results.size() > 1) { + throw new RuntimeException("Duplicate link '" + aName + "'"); + } + return results.get(0); + } +} diff --git a/crawler/src/org/wamblee/crawler/impl/PageTypeConfig.java 
b/crawler/src/org/wamblee/crawler/impl/PageTypeConfig.java new file mode 100644 index 00000000..08ef1605 --- /dev/null +++ b/crawler/src/org/wamblee/crawler/impl/PageTypeConfig.java @@ -0,0 +1,33 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import org.wamblee.crawler.PageRequest; + +/** + * + */ +public class PageTypeConfig extends ConfigItem { + + public PageTypeConfig(String aPattern, PageRequest aRequest) { + super(aPattern, aRequest); + } + + public PageRequest getRequest(String aType) { + return match(aType); + } +} diff --git a/crawler/src/org/wamblee/crawler/impl/UrlConfig.java b/crawler/src/org/wamblee/crawler/impl/UrlConfig.java new file mode 100644 index 00000000..1a2b06ab --- /dev/null +++ b/crawler/src/org/wamblee/crawler/impl/UrlConfig.java @@ -0,0 +1,46 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import org.wamblee.crawler.PageRequest; + +/** + * Represents the configuration for specific URLs. + */ +public class UrlConfig extends ConfigItem { + /** + * Constructs the information for how to perform a request for a specific + * URL. + * + * @param aPattern + * Pattern that the URL must match. + * @param aRequest + * Request that must be executed to retrieve the URL. + */ + public UrlConfig(String aPattern, PageRequest aRequest) { + super(aPattern, aRequest); + } + + /** + * Gets the request to execute. + * + * @return Request, or null if the URL does not match. + */ + public PageRequest getRequest(String aUrl) { + return match(aUrl); + } +}