X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2FAbstractPageRequest.java;h=73132aa89736e8d06abd9a0e2cd1a310a4a96fa1;hb=a5a9deb2dedb2efc96972acedaa44909a3b0fd79;hp=63764f53439d2c6ce836cb437ad2db100a4792f7;hpb=38796888b1520bc413806488d406f3e9442d0d4b;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 63764f53..73132aa8 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -23,6 +23,8 @@ import java.io.PrintStream; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; @@ -38,6 +40,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.w3c.dom.Document; import org.w3c.tidy.Tidy; +import org.wamblee.io.FileResource; import org.wamblee.xml.XSLT; /** @@ -47,6 +50,9 @@ public abstract class AbstractPageRequest implements PageRequest { private static final Log LOG = LogFactory.getLog(AbstractPageRequest.class); private static final String REDIRECT_HEADER = "Location"; + + private int _maxTries; + private int _maxDelay; private NameValuePair[] _params; @@ -54,13 +60,15 @@ public abstract class AbstractPageRequest implements PageRequest { private PrintStream _os; - protected AbstractPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) { + protected AbstractPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) { if ( aParams == null ) { throw new IllegalArgumentException("aParams is null"); } if ( aXslt == null ) { throw new IllegalArgumentException("aXslt is null"); } + _maxTries = aMaxTries; + _maxDelay = aMaxDelay; _params = aParams; _xslt = aXslt; _os = aOs; @@ -76,8 +84,24 @@ public abstract class AbstractPageRequest implements PageRequest { protected NameValuePair[] getParameters() { return _params; } + + protected Document executeMethod(HttpClient client, HttpMethod method) throws TransformerException { + int triesLeft = _maxTries; + while ( triesLeft > 0 ) { + triesLeft--; + try { + return executeMethodWithoutRetries(client, method); + } catch (TransformerException e) { + if ( triesLeft == 0 ) { + throw e; + } + } + } + throw new RuntimeException("Code should never reach this point"); + } + - protected Document executeMethod(HttpClient client, HttpMethod method) { + protected Document executeMethodWithoutRetries(HttpClient client, HttpMethod method) throws TransformerException { try { // Execute the method. method = executeWithRedirects(client, method); @@ -106,7 +130,7 @@ public abstract class AbstractPageRequest implements PageRequest { } xhtml.flush(); byte[] xhtmlData = xhtml.toByteArray(); - Document transformed = XSLT.transform(xhtmlData, new File(_xslt)); + Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt))); _os.println("Transformed result is: "); Transformer transformer = TransformerFactory.newInstance().newTransformer(); transformer.setParameter(OutputKeys.INDENT, "yes"); @@ -114,13 +138,26 @@ public abstract class AbstractPageRequest implements PageRequest { transformer.transform(new DOMSource(transformed), new StreamResult(_os)); return transformed; - } catch (Exception e) { + } catch (HttpException e) { + throw new RuntimeException(e.getMessage(), e); + } catch (IOException e) { + throw new RuntimeException(e.getMessage(), e); + } catch (TransformerConfigurationException e) { throw new RuntimeException(e.getMessage(), e); } finally { // Release the connection. method.releaseConnection(); } } + + private void delay() { + try { + Thread.sleep((long)((float)_maxDelay* Math.random())); + } catch (InterruptedException e) { + // + } + } + /** * @param aClient @@ -129,6 +166,7 @@ public abstract class AbstractPageRequest implements PageRequest { * @throws HttpException */ private HttpMethod executeWithRedirects(HttpClient aClient, HttpMethod aMethod) throws IOException, HttpException { + delay(); int statusCode = aClient.executeMethod(aMethod); switch (statusCode) {