X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FCrawlerImpl.java;h=098ed91f42ee30d072b2ce5d5339e37d3e8a2726;hb=f53c06ddca33e21e772c479179b7f858a3a8b8d4;hp=8db31606fc476e419cde8a4055a5286ad8f2c324;hpb=81bc61121a8f17f754fc99eb66603a59df242ddc;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java index 8db31606..098ed91f 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java @@ -12,11 +12,12 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.impl; import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Element; @@ -25,67 +26,83 @@ import org.w3c.dom.Document; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageException; import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PageType; /** - * Crawler implementation. + * Crawler implementation. */ public class CrawlerImpl implements Crawler { - + private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); - - private HttpClient _client; - private Configuration _config; - + + private HttpClient _client; + + private Configuration _config; + + /** + * Constructs the crawler. + * + * @param aClient + * Http client to use. + * @param aConfig + * Configuration. + */ public CrawlerImpl(HttpClient aClient, Configuration aConfig) { - _client = aClient; - _config = aConfig; + _client = aClient; + _config = aConfig; } /* - * (non-Javadoc) + * (non-Javadoc) + * * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) */ - public Page getPage(String aUrl) { - LOG.info("Getting page: url = '" + aUrl + "'"); + public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException { + LOG.debug("Getting page: url = '" + aUrl + "'"); PageRequest request = _config.getRequest(aUrl); - Document content = request.execute(aUrl, _client); - return transformToDom4jDoc(content); + Document content = request.execute(aUrl, aParams, _client); + return transformToDom4jDoc(aUrl, content); } - - /* (non-Javadoc) - * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String) + + /* + * (non-Javadoc) + * + * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, + * java.lang.String) */ - public Page getPage(String aUrl, PageType aType) { - LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); + public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException { + LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); PageRequest request = _config.getRequest(aType); - Document content = request.execute(aUrl, _client); - return transformToDom4jDoc(content); + Document content = request.execute(aUrl, aParams, _client); + return transformToDom4jDoc(aUrl, content); } /** - * @param aUrl - * @param request + * Converts a w3c DOM document to a page object. + * @param content DOM document. * @return */ - private Page transformToDom4jDoc(Document content) { - + private Page transformToDom4jDoc(String aUrl, Document content) { DOMReader reader = new DOMReader(); org.dom4j.Document dom4jDoc = reader.read(content); Element root = dom4jDoc.getRootElement(); dom4jDoc.remove(root); - - return new PageImpl(this, replaceReferencesWithContent(root)); + + return new PageImpl(aUrl, this, replaceReferencesWithContent(root)); } - + /** - * Perform crawling. Find references in the retrieved content and replace them - * by the content they refer to by retrieving the appropriate pages as well. - * @param content Content which must be made complete. - * @return Fully processed content. + * Perform crawling. Find references in the retrieved content and replace + * them by the content they refer to by retrieving the appropriate pages as + * well. + * + * @param content + * Content which must be made complete. + * @return Fully processed content. */ - private Element replaceReferencesWithContent(Element content) { - return content; // TODO implement. + private Element replaceReferencesWithContent(Element content) { + return content; // TODO implement. } }