import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
-import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.impl;
import org.wamblee.crawler.PageType;
/**
- *
+ * Action implementation.
*/
public class ActionImpl implements Action {
-
- private Crawler _crawler;
- private Element _content;
- private String _name;
- private String _reference;
- private PageType _type;
-
- public ActionImpl(Crawler aCrawler, Element aContent, String aName, String aReference) {
- _crawler = aCrawler;
- _content = aContent;
+
+ private Crawler _crawler;
+
+ private Element _content;
+
+ private String _name;
+
+ private String _reference;
+
+ private PageType _type;
+
+ /**
+ * Constructs the action.
+ *
+ * @param aCrawler
+ * Crawler to use.
+ * @param aContent
+ * Content of the action element in the page where the action
+ * occurs.
+ * @param aName
+ * Name of the action.
+ * @param aReference
+ * URL of the reference.
+ */
+ public ActionImpl(Crawler aCrawler, Element aContent, String aName,
+ String aReference) {
+ _crawler = aCrawler;
+ _content = aContent;
_name = aName;
_reference = aReference;
- _type = null;
+ _type = null;
}
-
- public ActionImpl(Crawler aCrawler, Element aContent, String aName, String aReference, PageType aType) {
+
+ /**
+ * Constructs the action.
+ *
+ * @param aCrawler
+ * Crawler to use.
+ * @param aContent
+ * Content of the action element in the page where the action
+ * occurs.
+ * @param aName
+ * Name of the action.
+ * @param aReference
+ * URL of the reference.
+ * @param aType
+ * Type of the referenced page.
+ */
+ public ActionImpl(Crawler aCrawler, Element aContent, String aName,
+ String aReference, PageType aType) {
_crawler = aCrawler;
- _content = aContent;
+ _content = aContent;
_name = aName;
- _reference = aReference;
- _type = aType;
+ _reference = aReference;
+ _type = aType;
}
- /* (non-Javadoc)
+ /*
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Action#getName()
*/
public String getName() {
return _name;
}
-
- /* (non-Javadoc)
+
+ /*
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Action#execute()
*/
public Page execute() throws PageException {
- if ( _type == null) {
+ if (_type == null) {
return _crawler.getPage(_reference);
}
return _crawler.getPage(_reference, _type);
}
-
- /* (non-Javadoc)
+
+ /*
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Action#getContent()
*/
public Element getContent() {
- return _content;
+ return _content;
}
}
private static final String LOG_FILE = "crawler.log";
+ /**
+ * Runs a test program.
+ * @param aArgs Arguments. The first argument is the crawler config file name and the second
+ * argument is the start URL.
+ * @throws Exception In case of problems.
+ */
public static void main(String[] aArgs) throws Exception {
String configFileName = aArgs[0];
String starturl = aArgs[1];
import java.util.regex.Pattern;
/**
- *
+ * Configuration item for obtaining an object in case a pattern matches.
*/
class ConfigItem<ValueType> {
private ValueType _value;
+ /**
+ * Constructs the item.
+ * @param aPattern Pattern.
+ * @param aValue Value.
+ */
protected ConfigItem(String aPattern, ValueType aValue) {
// Compile the regular expression once, up front, so that match() can reuse
// the compiled form. An invalid expression makes Pattern.compile throw
// PatternSyntaxException here rather than later during matching.
_pattern = Pattern.compile(aPattern);
// Object handed back by match() when the pattern matches.
_value = aValue;
}
+ /**
+ * Returns the object in case the value matches.
+ * @param aValue Value to match.
+ * @return Object in case there is a match, null otherwise.
+ */
protected ValueType match(String aValue) {
if (!_pattern.matcher(aValue).matches()) {
return null;
private List<PageTypeConfig> _pageTypeConfig;
+ /**
+ * Constructs the configuration.
+ * @param aUrlConfig List of URL configuration elements.
+ * @param aPageTypeConfig List of page type configuration elements.
+ */
public ConfigurationImpl(List<UrlConfig> aUrlConfig,
List<PageTypeConfig> aPageTypeConfig) {
_urlConfig = aUrlConfig;
private static final int MAX_DELAY = 5000;
private PrintStream _os;
-
+
+ /**
+ * Constructs the configuration parser.
+ * @param aOs The stream for logging requests.
+ * TODO plain java logging should be used instead of this awkward mechanism.
+ */
public ConfigurationParser(PrintStream aOs) {
_os = aOs;
}
+ /**
+ * Parses the configuration from an input stream.
+ * @param aStream Input stream to read the configuration from.
+ * @return Configuration.
+ */
public Configuration parse(InputStream aStream) {
try {
SAXReader reader = new SAXReader();
}
/**
- * @param aRoot
- * @return
+ * Parses the URL-based configuration.
+ * @param aRoot Root of the configuration file document.
+ * @return List of URL-based configurations.
*/
private List<UrlConfig> parseUrlConfigs(Element aRoot) {
List<UrlConfig> configs = new ArrayList<UrlConfig>();
return configs;
}
+ /**
+ * Parses the page type based configurations.
+ * @param aRoot Root of the configuration file document.
+ * @return List of page type based configurations.
+ */
private List<PageTypeConfig> parsePageTypeConfigs(Element aRoot) {
List<PageTypeConfig> configs = new ArrayList<PageTypeConfig>();
for (Iterator i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) {
return configs;
}
+ /**
+ * Parses a URL-based configuration.
+ * @param aUrlElem Configuration element.
+ * @return Configuration.
+ */
private UrlConfig parseUrlConfig(Element aUrlElem) {
// Text of the ELEM_PATTERN child element; handed on unchanged as the
// pattern of the resulting UrlConfig.
String pattern = aUrlElem.elementText(ELEM_PATTERN);
// The request settings are read from the same element by
// parseRequestConfig.
PageRequest request = parseRequestConfig(aUrlElem);
return new UrlConfig(pattern, request);
}
+ /**
+ * Parses a page type based configuration.
+ * @param aTypeElem Configuration element.
+ * @return Configuration.
+ */
private PageTypeConfig parsePageTypeConfig(Element aTypeElem) {
String pattern = aTypeElem.elementText(ELEM_PATTERN);
PageRequest request = parseRequestConfig(aTypeElem);
}
/**
- * @param aUrlElem
- * @return
+ * Parses a request configuration describing how to execute requests.
+ * @param aElem Configuration element.
+ * @return Page request.
*/
- private PageRequest parseRequestConfig(Element aUrlElem) {
- String method = aUrlElem.elementText(ELEM_METHOD);
- String xslt = aUrlElem.elementText(ELEM_XSLT);
+ private PageRequest parseRequestConfig(Element aElem) {
+ String method = aElem.elementText(ELEM_METHOD);
+ String xslt = aElem.elementText(ELEM_XSLT);
List<NameValuePair> params = new ArrayList<NameValuePair>();
- for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext();) {
+ for (Iterator i = aElem.elementIterator(ELEM_PARAM); i.hasNext();) {
Element paramElem = (Element) i.next();
NameValuePair param = parseParameter(paramElem);
params.add(param);
return request;
}
+ /**
+ * Parses a parameter definition.
+ * @param aParam Parameter.
+ * @return Name value pair describing a parameter.
+ */
private NameValuePair parseParameter(Element aParam) {
String name = aParam.attributeValue(AT_NAME);
String value = aParam.attributeValue(AT_VALUE);
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.impl;
import org.wamblee.crawler.PageType;
/**
- * Crawler implementation.
+ * Crawler implementation.
*/
public class CrawlerImpl implements Crawler {
-
+
private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
- private static final int MAX_DELAY = 5000;
-
- private HttpClient _client;
+
+ private static final int MAX_DELAY = 5000;
+
+ private HttpClient _client;
+
private Configuration _config;
-
+
+ /**
+ * Constructs the crawler.
+ *
+ * @param aClient
+ * Http client to use.
+ * @param aConfig
+ * Configuration.
+ */
public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
- _client = aClient;
- _config = aConfig;
+ _client = aClient;
+ _config = aConfig;
}
/*
- * (non-Javadoc)
+ * (non-Javadoc)
+ *
* @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
*/
public Page getPage(String aUrl) throws PageException {
LOG.info("Getting page: url = '" + aUrl + "'");
PageRequest request = _config.getRequest(aUrl);
Document content = request.execute(aUrl, _client);
- return transformToDom4jDoc(content);
+ return transformToDom4jDoc(content);
}
-
- /* (non-Javadoc)
- * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String)
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
+ * org.wamblee.crawler.PageType)
*/
public Page getPage(String aUrl, PageType aType) throws PageException {
LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
PageRequest request = _config.getRequest(aType);
Document content = request.execute(aUrl, _client);
- return transformToDom4jDoc(content);
+ return transformToDom4jDoc(content);
}
-
+
/**
- * @param aUrl
- * @param request
+ * Converts a w3c DOM document to a page object.
+ * @param content DOM document.
* @return Page wrapping the root element of the converted document.
*/
private Page transformToDom4jDoc(Document content) {
-
DOMReader reader = new DOMReader();
org.dom4j.Document dom4jDoc = reader.read(content);
Element root = dom4jDoc.getRootElement();
dom4jDoc.remove(root);
-
+
return new PageImpl(this, replaceReferencesWithContent(root));
}
-
+
/**
- * Perform crawling. Find references in the retrieved content and replace them
- * by the content they refer to by retrieving the appropriate pages as well.
- * @param content Content which must be made complete.
- * @return Fully processed content.
+ * Perform crawling. Find references in the retrieved content and replace
+ * them by the content they refer to by retrieving the appropriate pages as
+ * well.
+ *
+ * @param content
+ * Content which must be made complete.
+ * @return Fully processed content.
*/
- private Element replaceReferencesWithContent(Element content) {
- return content; // TODO implement.
+ private Element replaceReferencesWithContent(Element content) {
+ return content; // TODO implement.
}
}
return null;
}
if (results.size() > 1) {
- throw new RuntimeException("Duplicate link '" + aName + "'");
+ throw new RuntimeException("Duplicate action '" + aName + "'");
}
return results.get(0);
}
import org.wamblee.crawler.PageRequest;
/**
- *
+ * Page type configuration.
*/
public class PageTypeConfig extends ConfigItem<PageRequest> {
+ /**
+ * Constructs the configuration.
+ * @param aPattern Page type pattern.
+ * @param aRequest Page request.
+ */
public PageTypeConfig(String aPattern, PageRequest aRequest) {
super(aPattern, aRequest);
}
+ /**
+ * Returns the request in case the type matches.
+ * @param aType Page type.
+ * @return Request if the type matches, null otherwise.
+ */
public PageRequest getRequest(String aType) {
// Delegates to ConfigItem.match, which uses Matcher.matches(): the whole
// type string has to match the pattern, not just a part of it.
return match(aType);
}