/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.wamblee.crawler.impl;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.httpclient.NameValuePair;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.GetPageRequest;
import org.wamblee.crawler.PageRequest;
import org.wamblee.crawler.PostPageRequest;
import org.wamblee.xml.XslTransformer;

/**
 * Parsing of the configuration from an XML file.
 *
 * <p>
 * The parser reads the direct children of the document root: every
 * <code>url</code> element becomes a {@link UrlConfig} and every
 * <code>type</code> element becomes a {@link PageTypeConfig}. Both kinds of
 * element are read the same way: a <code>pattern</code> child, an optional
 * <code>method</code> child (<code>get</code> or <code>post</code>, GET when
 * absent), an <code>xslt</code> child, and any number of <code>param</code>
 * and <code>header</code> children carrying <code>name</code> and
 * <code>value</code> attributes.
 * </p>
 */
public class ConfigurationParser {

    private static final String ELEM_URL = "url";

    private static final String ELEM_TYPE = "type";

    private static final String ELEM_PATTERN = "pattern";

    private static final String ELEM_METHOD = "method";

    private static final String ELEM_XSLT = "xslt";

    private static final String ELEM_PARAM = "param";

    private static final String ELEM_HEADER = "header";

    private static final String AT_NAME = "name";

    private static final String AT_VALUE = "value";

    private static final String METHOD_POST = "post";

    private static final String METHOD_GET = "get";

    /**
     * First constructor argument of every page request created by this parser
     * (presumably the maximum number of attempts per request; confirm against
     * the request classes).
     */
    private static final int MAX_TRIES = 3;

    /**
     * Second constructor argument of every page request created by this parser
     * (presumably a maximum delay in milliseconds; confirm against the request
     * classes).
     */
    private static final int MAX_DELAY = 5000;

    /** Transformer handed to every page request created by this parser. */
    private final XslTransformer _transformer;

    /**
     * Constructs the configuration parser.
     *
     * @param aTransformer
     *            Transformer that is passed on to every page request created
     *            while parsing.
     */
    public ConfigurationParser(XslTransformer aTransformer) {
        _transformer = aTransformer;
    }

    /**
     * Parses the configuration from an input stream.
     *
     * @param aStream
     *            Stream containing the XML configuration. The stream is not
     *            closed by this method.
     * @return Configuration.
     * @throws RuntimeException
     *             If the XML document cannot be read, or if an element
     *             specifies a request method other than <code>get</code> or
     *             <code>post</code>.
     */
    public Configuration parse(InputStream aStream) {
        try {
            // NOTE(review): the reader is used with its default settings. If
            // configuration files can ever come from an untrusted source, DTD
            // and external entity processing should be disabled here.
            SAXReader reader = new SAXReader();
            Document document = reader.read(aStream);
            Element root = document.getRootElement();

            List<UrlConfig> urlConfigs = parseUrlConfigs(root);
            List<PageTypeConfig> pageTypeConfigs = parsePageTypeConfigs(root);
            return new ConfigurationImpl(urlConfigs, pageTypeConfigs);
        } catch (DocumentException e) {
            throw new RuntimeException("Problem parsing config file", e);
        }
    }

    /**
     * Parses the URL-based configuration.
     *
     * @param aRoot
     *            Root of the configuration file document.
     * @return List of URL-based configurations, in document order; empty if
     *         the root has no <code>url</code> children.
     */
    private List<UrlConfig> parseUrlConfigs(Element aRoot) {
        List<UrlConfig> configs = new ArrayList<UrlConfig>();
        for (Iterator<?> i = aRoot.elementIterator(ELEM_URL); i.hasNext();) {
            Element urlElem = (Element) i.next();
            configs.add(parseUrlConfig(urlElem));
        }
        return configs;
    }

    /**
     * Parses the page type based configurations.
     *
     * @param aRoot
     *            Root of the configuration file document.
     * @return List of page type based configurations, in document order;
     *         empty if the root has no <code>type</code> children.
     */
    private List<PageTypeConfig> parsePageTypeConfigs(Element aRoot) {
        List<PageTypeConfig> configs = new ArrayList<PageTypeConfig>();
        for (Iterator<?> i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) {
            Element typeElem = (Element) i.next();
            configs.add(parsePageTypeConfig(typeElem));
        }
        return configs;
    }

    /**
     * Parses a URL-based configuration.
     *
     * @param aUrlElem
     *            Configuration element.
     * @return Configuration built from the text of the <code>pattern</code>
     *         child and the request settings of the element.
     */
    private UrlConfig parseUrlConfig(Element aUrlElem) {
        String pattern = aUrlElem.elementText(ELEM_PATTERN);
        PageRequest request = parseRequestConfig(aUrlElem);
        return new UrlConfig(pattern, request);
    }

    /**
     * Parses a page type based configuration.
     *
     * @param aTypeElem
     *            Configuration element.
     * @return Configuration built from the text of the <code>pattern</code>
     *         child and the request settings of the element.
     */
    private PageTypeConfig parsePageTypeConfig(Element aTypeElem) {
        String pattern = aTypeElem.elementText(ELEM_PATTERN);
        PageRequest request = parseRequestConfig(aTypeElem);
        return new PageTypeConfig(pattern, request);
    }

    /**
     * Parses a request configuration describing how to execute requests.
     *
     * @param aElem
     *            Configuration element.
     * @return Page request: a POST request when the <code>method</code> child
     *         is <code>post</code>, a GET request when it is <code>get</code>
     *         or absent.
     * @throws RuntimeException
     *             If the <code>method</code> child has any other value.
     */
    private PageRequest parseRequestConfig(Element aElem) {
        String method = aElem.elementText(ELEM_METHOD);
        if (method != null) {
            // Whitespace around the value (e.g. from pretty-printed XML) must
            // not turn a valid method into an unknown one.
            method = method.trim();
        }
        String xslt = aElem.elementText(ELEM_XSLT);

        List<NameValuePair> params = parseNameValuePairs(aElem, ELEM_PARAM);
        List<NameValuePair> headers = parseNameValuePairs(aElem, ELEM_HEADER);
        NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]);
        NameValuePair[] headersArray = headers.toArray(new NameValuePair[0]);

        PageRequest request;
        if (METHOD_POST.equals(method)) {
            request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
                    headersArray, xslt, _transformer);
        } else if (METHOD_GET.equals(method) || method == null) {
            // No method element at all means GET.
            request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
                    headersArray, xslt, _transformer);
        } else {
            throw new RuntimeException("Unknown request method '" + method
                    + "'. Only " + METHOD_GET + " and " + METHOD_POST
                    + " are supported");
        }
        return request;
    }

    /**
     * Collects the name/value pairs defined by all direct children of an
     * element that have a given element name.
     *
     * @param aElem
     *            Element whose children are examined.
     * @param aElemName
     *            Name of the child elements to read (used for both parameters
     *            and headers).
     * @return Name/value pairs in document order; empty if there are no
     *         matching children.
     */
    private List<NameValuePair> parseNameValuePairs(Element aElem,
            String aElemName) {
        List<NameValuePair> pairs = new ArrayList<NameValuePair>();
        for (Iterator<?> i = aElem.elementIterator(aElemName); i.hasNext();) {
            Element pairElem = (Element) i.next();
            pairs.add(parseParameter(pairElem));
        }
        return pairs;
    }

    /**
     * Parses a parameter definition.
     *
     * @param aParam
     *            Element carrying <code>name</code> and <code>value</code>
     *            attributes.
     * @return Name value pair describing a parameter. The attribute values
     *         are passed on exactly as read from the element, without
     *         validation.
     */
    private NameValuePair parseParameter(Element aParam) {
        String name = aParam.attributeValue(AT_NAME);
        String value = aParam.attributeValue(AT_VALUE);
        return new NameValuePair(name, value);
    }
}