package org.wamblee.crawler.impl;
import java.io.InputStream;
-import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.wamblee.crawler.GetPageRequest;
import org.wamblee.crawler.PageRequest;
import org.wamblee.crawler.PostPageRequest;
+import org.wamblee.xml.XslTransformer;
/**
* Parsing of the configuration from an XML file.
private static final String ELEM_XSLT = "xslt";
private static final String ELEM_PARAM = "param";
+
+ private static final String ELEM_HEADER = "header";
private static final String AT_NAME = "name";
private static final int MAX_TRIES = 3;
- private static final int MAX_DELAY = 5000;
+ private static final int MAX_DELAY = 1000;
+
+ private XslTransformer _transformer;
- private PrintStream _os;
-
- public ConfigurationParser(PrintStream aOs) {
- _os = aOs;
+ /**
+ * Constructs the configuration parser.
+ */
+ public ConfigurationParser(XslTransformer aTransformer) {
+ _transformer = aTransformer;
}
+ /**
+ * Parses the configuration from an input stream.
+ * @param aStream Input file.
+ * @return Configuration.
+ */
public Configuration parse(InputStream aStream) {
try {
SAXReader reader = new SAXReader();
}
/**
- * @param aRoot
- * @return
+ * Parses the URL-based configuration.
+ * @param aRoot Root of the configuration file document.
+ * @return List of URL-based configurations.
*/
private List<UrlConfig> parseUrlConfigs(Element aRoot) {
List<UrlConfig> configs = new ArrayList<UrlConfig>();
return configs;
}
+ /**
+ * Parses the page type based configurations.
+ * @param aRoot Root of the configuration file document.
+ * @return List of page type based configurations.
+ */
private List<PageTypeConfig> parsePageTypeConfigs(Element aRoot) {
List<PageTypeConfig> configs = new ArrayList<PageTypeConfig>();
for (Iterator i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) {
return configs;
}
+ /**
+ * Parses a URL-based configuration.
+ * @param aUrlElem Configuration element.
+ * @return Configuration.
+ */
private UrlConfig parseUrlConfig(Element aUrlElem) {
String pattern = aUrlElem.elementText(ELEM_PATTERN);
PageRequest request = parseRequestConfig(aUrlElem);
return new UrlConfig(pattern, request);
}
+ /**
+ * Parses a page type based configuration.
+ * @param aTypeElem Configuration element.
+ * @return Configuration.
+ */
private PageTypeConfig parsePageTypeConfig(Element aTypeElem) {
String pattern = aTypeElem.elementText(ELEM_PATTERN);
PageRequest request = parseRequestConfig(aTypeElem);
}
/**
- * @param aUrlElem
- * @return
+ * Parses a request configuration describing how to execute requests.
+ * @param aElem Configuration element.
+ * @return Page request.
*/
- private PageRequest parseRequestConfig(Element aUrlElem) {
- String method = aUrlElem.elementText(ELEM_METHOD);
- String xslt = aUrlElem.elementText(ELEM_XSLT);
- List<NameValuePair> params = new ArrayList<NameValuePair>();
- for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext();) {
- Element paramElem = (Element) i.next();
- NameValuePair param = parseParameter(paramElem);
- params.add(param);
- }
-
+ private PageRequest parseRequestConfig(Element aElem) {
+ String method = aElem.elementText(ELEM_METHOD);
+ String xslt = aElem.elementText(ELEM_XSLT);
+ List<NameValuePair> params = parseNameValuePairs(aElem, ELEM_PARAM);
+ List<NameValuePair> headers = parseNameValuePairs(aElem, ELEM_HEADER);
+
NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]);
+ NameValuePair[] headersArray = headers.toArray(new NameValuePair[0]);
PageRequest request;
if (METHOD_POST.equals(method)) {
- request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
- xslt, _os);
+ request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray,
+ xslt, _transformer);
} else if (METHOD_GET.equals(method) || method == null) {
- request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray,
- xslt, _os);
+ request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray,
+ xslt, _transformer);
} else {
throw new RuntimeException("Unknown request method '" + method
+ "'. Only " + METHOD_GET + " and " + METHOD_POST
return request;
}
+ /**
+ * Parses all child elements with a given name into name/value pairs.
+ * Used for both the parameter and the header elements of a request configuration.
+ * @param aElem Configuration element whose child elements are parsed.
+ * @param aElemName Name of the child elements to parse.
+ * @return List of name/value pairs, one per matching child element; empty if there are none.
+ */
+ private List<NameValuePair> parseNameValuePairs(Element aElem, String aElemName) {
+ // Despite the local names, the pairs are not necessarily headers: the element name decides.
+ List<NameValuePair> headers = new ArrayList<NameValuePair>();
+ for (Iterator i = aElem.elementIterator(aElemName); i.hasNext();) {
+ Element paramElem = (Element) i.next();
+ NameValuePair header = parseParameter(paramElem);
+ headers.add(header);
+ }
+ return headers;
+ }
+
+ /**
+ * Parses a parameter definition.
+ * @param aParam Parameter.
+ * @return Name value pair describing a parameter.
+ */
private NameValuePair parseParameter(Element aParam) {
String name = aParam.attributeValue(AT_NAME);
String value = aParam.attributeValue(AT_VALUE);