X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FConfigurationParser.java;fp=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FConfigurationParser.java;h=0000000000000000000000000000000000000000;hb=62f165891f08ae532b5a794af11d7338a93f9a43;hp=e88f4007a1dbe8d5b0c38e6b93ae816a482fbfb8;hpb=07cedd3f0730646ea35a7f668b3e1e872a4605d9;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java deleted file mode 100644 index e88f4007..00000000 --- a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright 2005 the original author or authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.wamblee.crawler.impl; - -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.commons.httpclient.NameValuePair; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Element; -import org.dom4j.io.SAXReader; -import org.wamblee.crawler.Configuration; -import org.wamblee.crawler.GetPageRequest; -import org.wamblee.crawler.PageRequest; -import org.wamblee.crawler.PostPageRequest; -import org.wamblee.xml.XslTransformer; - -/** - * Parsing of the configuration from an XML file. - */ -public class ConfigurationParser { - - private static final String ELEM_URL = "url"; - - private static final String ELEM_TYPE = "type"; - - private static final String ELEM_PATTERN = "pattern"; - - private static final String ELEM_METHOD = "method"; - - private static final String ELEM_XSLT = "xslt"; - - private static final String ELEM_PARAM = "param"; - - private static final String ELEM_HEADER = "header"; - - private static final String AT_NAME = "name"; - - private static final String AT_VALUE = "value"; - - private static final String METHOD_POST = "post"; - - private static final String METHOD_GET = "get"; - - private static final int MAX_TRIES = 3; - - private static final int MAX_DELAY = 10000; - - private XslTransformer _transformer; - - /** - * Constructs the configuration parser. - */ - public ConfigurationParser(XslTransformer aTransformer) { - _transformer = aTransformer; - } - - /** - * Parses the configuration from an input stream. - * @param aStream Input file. - * @return Configuration. - */ - public Configuration parse(InputStream aStream) { - try { - SAXReader reader = new SAXReader(); - Document document = reader.read(aStream); - - Element root = document.getRootElement(); - List urlConfigs = parseUrlConfigs(root); - List pageTypeConfigs = parsePageTypeConfigs(root); - return new ConfigurationImpl(urlConfigs, pageTypeConfigs); - } catch (DocumentException e) { - throw new RuntimeException("Problem parsing config file", e); - } - } - - /** - * Parses the URL-based configuration. - * @param aRoot Root of the configuration file document. - * @return List of URL-based configurations. - */ - private List parseUrlConfigs(Element aRoot) { - List configs = new ArrayList(); - for (Iterator i = aRoot.elementIterator(ELEM_URL); i.hasNext();) { - Element url = (Element) i.next(); - UrlConfig config = parseUrlConfig(url); - configs.add(config); - } - return configs; - } - - /** - * Parses the page type based configurations. - * @param aRoot Root of the configuration file document. - * @return LIst of page type based configurations. - */ - private List parsePageTypeConfigs(Element aRoot) { - List configs = new ArrayList(); - for (Iterator i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) { - Element url = (Element) i.next(); - PageTypeConfig config = parsePageTypeConfig(url); - configs.add(config); - } - return configs; - } - - /** - * Parses a URL-based configuration. - * @param aUrlElem Configuration element. - * @return Configuration. - */ - private UrlConfig parseUrlConfig(Element aUrlElem) { - String pattern = aUrlElem.elementText(ELEM_PATTERN); - PageRequest request = parseRequestConfig(aUrlElem); - return new UrlConfig(pattern, request); - } - - /** - * Parses a page type based configuration. - * @param aTypeElem Configuration element. - * @return Configuration. - */ - private PageTypeConfig parsePageTypeConfig(Element aTypeElem) { - String pattern = aTypeElem.elementText(ELEM_PATTERN); - PageRequest request = parseRequestConfig(aTypeElem); - return new PageTypeConfig(pattern, request); - } - - /** - * Parses a request configuration describing how to execute requests. - * @param aElem Configuration element. - * @return Page request. - */ - private PageRequest parseRequestConfig(Element aElem) { - String method = aElem.elementText(ELEM_METHOD); - String xslt = aElem.elementText(ELEM_XSLT); - List params = parseNameValuePairs(aElem, ELEM_PARAM); - List headers = parseNameValuePairs(aElem, ELEM_HEADER); - - NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]); - NameValuePair[] headersArray = headers.toArray(new NameValuePair[0]); - PageRequest request; - if (METHOD_POST.equals(method)) { - request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray, - xslt, _transformer); - } else if (METHOD_GET.equals(method) || method == null) { - request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray, - xslt, _transformer); - } else { - throw new RuntimeException("Unknown request method '" + method - + "'. Only " + METHOD_GET + " and " + METHOD_POST - + " are supported"); - } - return request; - } - - /** - * @param aElem - * @return - */ - private List parseNameValuePairs(Element aElem, String aElemName) { - List headers = new ArrayList(); - for (Iterator i = aElem.elementIterator(aElemName); i.hasNext();) { - Element paramElem = (Element) i.next(); - NameValuePair header = parseParameter(paramElem); - headers.add(header); - } - return headers; - } - - /** - * Parses a parameter definition. - * @param aParam Parameter. - * @return Name value pair describing a parameter. - */ - private NameValuePair parseParameter(Element aParam) { - String name = aParam.attributeValue(AT_NAME); - String value = aParam.attributeValue(AT_VALUE); - return new NameValuePair(name, value); - } -}