+++ /dev/null
-/*
- * Copyright 2005 the original author or authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.wamblee.crawler.impl;
-
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.commons.httpclient.NameValuePair;
-import org.dom4j.Document;
-import org.dom4j.DocumentException;
-import org.dom4j.Element;
-import org.dom4j.io.SAXReader;
-import org.wamblee.crawler.Configuration;
-import org.wamblee.crawler.GetPageRequest;
-import org.wamblee.crawler.PageRequest;
-import org.wamblee.crawler.PostPageRequest;
-import org.wamblee.xml.XslTransformer;
-
-/**
- * Parses the crawler configuration from an XML document.
- *
- * <p>The parser looks at the direct children of the document root:
- * every <code>url</code> child becomes a {@link UrlConfig} and every
- * <code>type</code> child becomes a {@link PageTypeConfig}. Both kinds of
- * element are read the same way: an optional <code>pattern</code>,
- * <code>method</code> and <code>xslt</code> child, plus any number of
- * <code>param</code> and <code>header</code> children carrying
- * <code>name</code>/<code>value</code> attributes.
- */
-public class ConfigurationParser {
-
-    private static final String ELEM_URL = "url";
-
-    private static final String ELEM_TYPE = "type";
-
-    private static final String ELEM_PATTERN = "pattern";
-
-    private static final String ELEM_METHOD = "method";
-
-    private static final String ELEM_XSLT = "xslt";
-
-    private static final String ELEM_PARAM = "param";
-
-    private static final String ELEM_HEADER = "header";
-
-    private static final String AT_NAME = "name";
-
-    private static final String AT_VALUE = "value";
-
-    private static final String METHOD_POST = "post";
-
-    private static final String METHOD_GET = "get";
-
-    // MAX_TRIES and MAX_DELAY are handed unchanged to every page request as
-    // its first two constructor arguments. Presumably a retry count and a
-    // delay in milliseconds -- TODO confirm against the PageRequest classes.
-    private static final int MAX_TRIES = 3;
-
-    private static final int MAX_DELAY = 10000;
-
-    /** Transformer passed on to every page request created by this parser. */
-    private final XslTransformer _transformer;
-
-    /**
-     * Constructs the configuration parser.
-     *
-     * @param aTransformer Transformer that is passed to each page request
-     *            built from the configuration.
-     */
-    public ConfigurationParser(XslTransformer aTransformer) {
-        _transformer = aTransformer;
-    }
-
-    /**
-     * Parses the configuration from an input stream.
-     *
-     * @param aStream Stream containing the XML configuration document.
-     * @return Configuration built from the <code>url</code> and
-     *         <code>type</code> children of the document root.
-     * @throws RuntimeException If the document cannot be parsed (wraps the
-     *             underlying {@link DocumentException}) or if a request
-     *             method other than get/post is configured.
-     */
-    public Configuration parse(InputStream aStream) {
-        try {
-            // NOTE(review): the reader is used with its default settings. If
-            // configuration files can ever come from an untrusted source,
-            // consider disabling DOCTYPE declarations / external entities.
-            SAXReader reader = new SAXReader();
-            Document document = reader.read(aStream);
-
-            Element root = document.getRootElement();
-            List<UrlConfig> urlConfigs = parseUrlConfigs(root);
-            List<PageTypeConfig> pageTypeConfigs = parsePageTypeConfigs(root);
-            return new ConfigurationImpl(urlConfigs, pageTypeConfigs);
-        } catch (DocumentException e) {
-            throw new RuntimeException("Problem parsing config file", e);
-        }
-    }
-
-    /**
-     * Parses the URL-based configuration.
-     *
-     * @param aRoot Root of the configuration file document.
-     * @return List of URL-based configurations, in document order; empty if
-     *         the root has no <code>url</code> children.
-     */
-    private List<UrlConfig> parseUrlConfigs(Element aRoot) {
-        List<UrlConfig> configs = new ArrayList<UrlConfig>();
-        // Iterator<?> plus a cast avoids a raw type and compiles whether the
-        // dom4j version in use returns a raw or a generic iterator.
-        for (Iterator<?> i = aRoot.elementIterator(ELEM_URL); i.hasNext();) {
-            Element urlElem = (Element) i.next();
-            configs.add(parseUrlConfig(urlElem));
-        }
-        return configs;
-    }
-
-    /**
-     * Parses the page type based configurations.
-     *
-     * @param aRoot Root of the configuration file document.
-     * @return List of page type based configurations, in document order;
-     *         empty if the root has no <code>type</code> children.
-     */
-    private List<PageTypeConfig> parsePageTypeConfigs(Element aRoot) {
-        List<PageTypeConfig> configs = new ArrayList<PageTypeConfig>();
-        for (Iterator<?> i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) {
-            Element typeElem = (Element) i.next();
-            configs.add(parsePageTypeConfig(typeElem));
-        }
-        return configs;
-    }
-
-    /**
-     * Parses a URL-based configuration.
-     *
-     * @param aUrlElem Configuration element.
-     * @return Configuration combining the text of the <code>pattern</code>
-     *         child (passed on as-is, without validation) with the request
-     *         settings of the element.
-     */
-    private UrlConfig parseUrlConfig(Element aUrlElem) {
-        String pattern = aUrlElem.elementText(ELEM_PATTERN);
-        PageRequest request = parseRequestConfig(aUrlElem);
-        return new UrlConfig(pattern, request);
-    }
-
-    /**
-     * Parses a page type based configuration.
-     *
-     * @param aTypeElem Configuration element.
-     * @return Configuration combining the text of the <code>pattern</code>
-     *         child (passed on as-is, without validation) with the request
-     *         settings of the element.
-     */
-    private PageTypeConfig parsePageTypeConfig(Element aTypeElem) {
-        String pattern = aTypeElem.elementText(ELEM_PATTERN);
-        PageRequest request = parseRequestConfig(aTypeElem);
-        return new PageTypeConfig(pattern, request);
-    }
-
-    /**
-     * Parses a request configuration describing how to execute requests.
-     *
-     * <p>The <code>method</code> child selects the request type. It is
-     * compared ignoring case and surrounding whitespace, so "POST" and
-     * " get " are accepted as well as the lower-case forms. When no method
-     * is configured a GET request is created.
-     *
-     * @param aElem Configuration element.
-     * @return Page request.
-     * @throws RuntimeException If a method other than get or post is
-     *             configured.
-     */
-    private PageRequest parseRequestConfig(Element aElem) {
-        String method = aElem.elementText(ELEM_METHOD);
-        String xslt = aElem.elementText(ELEM_XSLT);
-        List<NameValuePair> params = parseNameValuePairs(aElem, ELEM_PARAM);
-        List<NameValuePair> headers = parseNameValuePairs(aElem, ELEM_HEADER);
-
-        NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]);
-        NameValuePair[] headersArray = headers.toArray(new NameValuePair[0]);
-
-        // Match on a trimmed copy; the raw text is kept for the error message.
-        String normalized = (method == null) ? null : method.trim();
-
-        if (METHOD_POST.equalsIgnoreCase(normalized)) {
-            return new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
-                    headersArray, xslt, _transformer);
-        }
-        // A missing method element means GET.
-        if (normalized == null || METHOD_GET.equalsIgnoreCase(normalized)) {
-            return new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
-                    headersArray, xslt, _transformer);
-        }
-        throw new RuntimeException("Unknown request method '" + method
-                + "'. Only " + METHOD_GET + " and " + METHOD_POST
-                + " are supported");
-    }
-
-    /**
-     * Collects the name/value pairs defined by all children of an element
-     * that have a given element name.
-     *
-     * @param aElem Element whose children are inspected.
-     * @param aElemName Name of the child elements to read, e.g. the param or
-     *            header element name.
-     * @return Name value pairs in document order; empty if there is no child
-     *         with the given name.
-     */
-    private List<NameValuePair> parseNameValuePairs(Element aElem, String aElemName) {
-        List<NameValuePair> pairs = new ArrayList<NameValuePair>();
-        for (Iterator<?> i = aElem.elementIterator(aElemName); i.hasNext();) {
-            Element pairElem = (Element) i.next();
-            pairs.add(parseParameter(pairElem));
-        }
-        return pairs;
-    }
-
-    /**
-     * Parses a parameter definition.
-     *
-     * @param aParam Element carrying the <code>name</code> and
-     *            <code>value</code> attributes.
-     * @return Name value pair built from the two attribute values, which are
-     *         passed on as returned by the element without validation.
-     */
-    private NameValuePair parseParameter(Element aParam) {
-        String name = aParam.attributeValue(AT_NAME);
-        String value = aParam.attributeValue(AT_VALUE);
-        return new NameValuePair(name, value);
-    }
-}