import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.tidy.Tidy;
+import org.wamblee.io.FileResource;
import org.wamblee.xml.XSLT;
/**
private static final Log LOG = LogFactory.getLog(AbstractPageRequest.class);
private static final String REDIRECT_HEADER = "Location";
+
+ private int _maxTries;
+ private int _maxDelay;
private NameValuePair[] _params;
private PrintStream _os;
- protected AbstractPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+ protected AbstractPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) {
if ( aParams == null ) {
throw new IllegalArgumentException("aParams is null");
}
if ( aXslt == null ) {
throw new IllegalArgumentException("aXslt is null");
}
+ _maxTries = aMaxTries;
+ _maxDelay = aMaxDelay;
_params = aParams;
_xslt = aXslt;
_os = aOs;
+ /**
+ * Gets the request parameters that were passed to the constructor.
+ * The internal array itself is returned, not a copy.
+ *
+ * @return Request parameters.
+ */
protected NameValuePair[] getParameters() {
return _params;
}
+
+ /**
+  * Executes the method and transforms the result, trying again when the
+  * transformation fails.
+  *
+  * Only a TransformerException triggers another attempt; any other exception
+  * thrown by executeMethodWithoutRetries propagates immediately. At least one
+  * attempt is always made, also when the configured maximum number of tries
+  * is zero or negative.
+  *
+  * @param client Http client to use.
+  * @param method Method to execute.
+  * @return Transformed document.
+  * @throws TransformerException The failure of the last attempt, when all attempts failed.
+  */
+ protected Document executeMethod(HttpClient client, HttpMethod method) throws TransformerException {
+     // Guard against a non-positive configuration value: without it the loop
+     // would never run and no request would be made at all.
+     int attempts = Math.max(1, _maxTries);
+     TransformerException lastFailure = null;
+     for (int attempt = 0; attempt < attempts; attempt++) {
+         try {
+             return executeMethodWithoutRetries(client, method);
+         } catch (TransformerException e) {
+             // Remember the failure; it is rethrown once the tries are used up.
+             lastFailure = e;
+         }
+     }
+     // attempts >= 1, so the loop ran and lastFailure is set at this point.
+     throw lastFailure;
+ }
+
- protected Document executeMethod(HttpClient client, HttpMethod method) {
+ protected Document executeMethodWithoutRetries(HttpClient client, HttpMethod method) throws TransformerException {
try {
// Execute the method.
method = executeWithRedirects(client, method);
}
xhtml.flush();
byte[] xhtmlData = xhtml.toByteArray();
- Document transformed = XSLT.transform(xhtmlData, new File(_xslt));
+ Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt)));
_os.println("Transformed result is: ");
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setParameter(OutputKeys.INDENT, "yes");
transformer.transform(new DOMSource(transformed), new StreamResult(_os));
return transformed;
- } catch (Exception e) {
+ } catch (HttpException e) {
+ throw new RuntimeException(e.getMessage(), e);
+ } catch (IOException e) {
+ throw new RuntimeException(e.getMessage(), e);
+ } catch (TransformerConfigurationException e) {
throw new RuntimeException(e.getMessage(), e);
} finally {
// Release the connection.
method.releaseConnection();
}
}
+
+ /**
+  * Sleeps for a random time that is shorter than the configured maximum delay
+  * (in milliseconds). Does nothing when the maximum delay is not positive.
+  * When the sleep is interrupted the method returns early and leaves the
+  * interrupt status of the thread set.
+  */
+ private void delay() {
+     if (_maxDelay <= 0) {
+         // Thread.sleep rejects negative values; there is nothing to wait for.
+         return;
+     }
+     try {
+         Thread.sleep((long) ((float) _maxDelay * Math.random()));
+     } catch (InterruptedException e) {
+         // Do not swallow the interrupt: restore the flag so that callers
+         // further up the stack can still notice it.
+         Thread.currentThread().interrupt();
+     }
+ }
+
/**
* @param aClient
* @throws HttpException
*/
private HttpMethod executeWithRedirects(HttpClient aClient, HttpMethod aMethod) throws IOException, HttpException {
+ delay();
int statusCode = aClient.executeMethod(aMethod);
switch (statusCode) {
* Executes the action.
* @return
*/
- Page execute();
+ Page execute() throws PageException;
/**
* Gets a description of the action. THe element returned is the action element
* @param aUrl Url of page.
* @return Page to retrieve.
*/
- Page getPage(String aUrl);
+ Page getPage(String aUrl) throws PageException;
/**
* Gets the content for a specific page.
* @param aType Type of page.
* @return Page.
*/
- Page getPage(String aUrl, PageType aType);
+ Page getPage(String aUrl, PageType aType) throws PageException;
}
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler;
import java.io.PrintStream;
+import javax.xml.transform.TransformerException;
+
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.NameValuePair;
import org.w3c.dom.Document;
/**
- * Gets a page by issueing a get request.
+ * Gets a page by issuing a GET request.
*/
public class GetPageRequest extends AbstractPageRequest {
-
- public GetPageRequest(NameValuePair[] aParams, String aXslt) {
- super(aParams, aXslt, null);
+
+ /**
+ * Constructs the request without an output stream.
+ * NOTE(review): null is passed on as the stream; confirm that the superclass tolerates this.
+ *
+ * @param aMaxTries Maximum number of tries, passed on to the superclass.
+ * @param aMaxDelay Maximum delay, passed on to the superclass.
+ * @param aParams Request parameters to put in the query string.
+ * @param aXslt Xslt, passed on to the superclass.
+ */
+ public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) {
+ super(aMaxTries, aMaxDelay, aParams, aXslt, null);
}
-
- public GetPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) {
- super(aParams, aXslt, aOs);
+
+ /**
+ * Constructs the request.
+ *
+ * @param aMaxTries Maximum number of tries, passed on to the superclass.
+ * @param aMaxDelay Maximum delay, passed on to the superclass.
+ * @param aParams Request parameters to put in the query string.
+ * @param aXslt Xslt, passed on to the superclass.
+ * @param aOs Output stream, passed on to the superclass.
+ */
+ public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+ super(aMaxTries, aMaxDelay, aParams, aXslt, aOs);
}
-
- /* (non-Javadoc)
+ /*
+ * (non-Javadoc)
+ *
- * @see org.wamblee.crawler.PageRequest#getPage(org.apache.commons.httpclient.HttpClient)
+ * @see org.wamblee.crawler.PageRequest#execute(java.lang.String,
+ * org.apache.commons.httpclient.HttpClient)
*/
- public Document execute(String aUrl, HttpClient aClient) {
+ public Document execute(String aUrl, HttpClient aClient)
+ throws PageException {
HttpMethod method = new GetMethod(aUrl);
- if ( getParameters().length > 0 ) {
+ if (getParameters().length > 0) {
String oldQueryString = method.getQueryString();
method.setQueryString(getParameters());
String queryString = method.getQueryString();
- if ( oldQueryString.length() > 0 ) {
+ // A URL without a query part can yield a null query string; in that
+ // case there is nothing to append after the request parameters.
+ if (oldQueryString != null && oldQueryString.length() > 0) {
queryString = queryString + '&' + oldQueryString;
method.setQueryString(queryString);
}
}
-
- return executeMethod(aClient, method);
+ try {
+ return executeMethod(aClient, method);
+ } catch (TransformerException e) {
+ // Report transformation problems through the crawler's own exception type.
+ throw new PageException(e.getMessage(), e);
+ }
}
}
* @param aClient Http client to use.
* @return Client.
*/
- Document execute(String aUrl, HttpClient aClient);
+ Document execute(String aUrl, HttpClient aClient) throws PageException;
/**
* Overrides the Xslt to use.
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler;
import java.io.PrintStream;
+import javax.xml.transform.TransformerException;
+
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;
* Retrieving pages using the post method.
*/
public class PostPageRequest extends AbstractPageRequest {
-
- public PostPageRequest(NameValuePair[] aParams, String aXslt) {
- super(aParams, aXslt, null);
+
+ public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) {
+ super(aMaxTries, aMaxDelay, aParams, aXslt, null);
}
-
- public PostPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) {
- super(aParams, aXslt, aOs);
+
+ public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt,
+ PrintStream aOs) {
+ super(aMaxTries, aMaxDelay, aParams, aXslt, aOs);
}
-
-
- /* (non-Javadoc)
- * @see org.wamblee.crawler.PageRequest#execute(java.lang.String, org.apache.commons.httpclient.HttpClient)
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.wamblee.crawler.PageRequest#execute(java.lang.String,
+ * org.apache.commons.httpclient.HttpClient)
*/
- public Document execute(String aUrl, HttpClient aClient) {
+ public Document execute(String aUrl, HttpClient aClient)
+ throws PageException {
PostMethod method = new PostMethod(aUrl);
method.addParameters(getParameters());
- return executeMethod(aClient, method);
+ try {
+ return executeMethod(aClient, method);
+ } catch (TransformerException e) {
+ throw new PageException(e.getMessage(), e);
+ }
}
}
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
import org.wamblee.crawler.PageType;
/**
/* (non-Javadoc)
* @see org.wamblee.crawler.Action#execute()
*/
- public Page execute() {
+ public Page execute() throws PageException {
if ( _type == null) {
return _crawler.getPage(_reference);
}
import java.io.PrintStream;
import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
/*
* Copyright 2005 the original author or authors.
*/
public class App {
+ private static final Log LOG = LogFactory.getLog(App.class);
+
private static final String LOG_FILE = "crawler.log";
public static void main(String[] args) throws Exception {
String configFileName = args[0];
String starturl = args[1];
-
+
FileOutputStream fos = new FileOutputStream(new File(LOG_FILE));
PrintStream os = new PrintStream(fos);
HttpClient client = new HttpClient();
// client.getHostConfiguration().setProxy("localhost", 3128);
-
+
Crawler crawler = new CrawlerImpl(client, config);
System.out.println("Retrieving: " + starturl);
*/
private static void showPage(Page aPage) {
Action[] links = aPage.getActions();
- for (Action link: links) {
+ for (Action link : links) {
System.out.println("Link found '" + link.getName() + "'");
}
- Element element = aPage.getContent();
+ Element element = aPage.getContent();
System.out.println("Retrieved content: " + element.asXML());
}
-
- private static void recordInterestingShows(Page page) {
+
+ private static void recordInterestingShows(Page page) throws PageException {
Action[] channels = page.getActions();
- for (Action channel: channels) {
- examineChannel(channel.getName(), channel.execute().getAction("right-now").execute());
+ for (Action channel : channels) {
+ examineChannel(channel.getName(), channel.execute().getAction(
+ "right-now").execute());
}
}
-
- private static void examineChannel(String aChannel, Page aPage) {
- Action[] programs = aPage.getActions();
- for (Action program: programs) {
+
+ private static void examineChannel(String aChannel, Page aPage) throws PageException {
+ Action[] programs = aPage.getActions();
+ for (Action program : programs) {
System.out.println(aChannel + " - " + program.getName());
- if ( program.getName().toLowerCase().matches(".*babe.*")) {
+ if (program.getName().toLowerCase().matches(".*babe.*")) {
Page programPage = program.execute();
- Action record = programPage.getAction("record");
+ Action record = programPage.getAction("record");
System.out.println("Recording possible: " + record != null);
}
}
private static final String METHOD_POST = "post";
private static final String METHOD_GET = "get";
+ private static final int MAX_TRIES = 3;
+ private static final int MAX_DELAY = 5000;
+
private PrintStream _os;
public ConfigurationParser(PrintStream aOs) {
NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]);
PageRequest request;
if ( METHOD_POST.equals(method)) {
- request = new PostPageRequest(paramsArray, xslt, _os);
+ request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os);
}
else if ( METHOD_GET.equals(method) || method == null ){
- request = new GetPageRequest(paramsArray, xslt, _os);
+ request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os);
} else {
throw new RuntimeException("Unknown request method '" + method + "'. Only " +
METHOD_GET + " and " + METHOD_POST + " are supported");
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
import org.wamblee.crawler.PageRequest;
import org.wamblee.crawler.PageType;
public class CrawlerImpl implements Crawler {
private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
+ private static final int MAX_DELAY = 5000;
private HttpClient _client;
- private Configuration _config;
+ private Configuration _config;
public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
_client = aClient;
* (non-Javadoc)
* @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
*/
- public Page getPage(String aUrl) {
+ public Page getPage(String aUrl) throws PageException {
LOG.info("Getting page: url = '" + aUrl + "'");
PageRequest request = _config.getRequest(aUrl);
Document content = request.execute(aUrl, _client);
/* (non-Javadoc)
* @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String)
*/
- public Page getPage(String aUrl, PageType aType) {
+ public Page getPage(String aUrl, PageType aType) throws PageException {
LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
PageRequest request = _config.getRequest(aType);
Document content = request.execute(aUrl, _client);
return transformToDom4jDoc(content);
}
-
+
/**
* @param aUrl
* @param request
&header;
<target name="module.build.deps"
- depends="logging.d,commons-codec.d,dom4j.d,xerces.d,httpclient.d,jtidy.d,wamblee.support.d,wamblee.crawler.d">
+ depends="logging.d,mail.d,commons-codec.d,dom4j.d,xerces.d,httpclient.d,jtidy.d,wamblee.support.d,wamblee.crawler.d">
</target>
<!-- Set libraries to use in addition for test, a library which
import java.io.File;
import java.io.FileInputStream;
+import java.io.FileNotFoundException;
import java.io.FileOutputStream;
+import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;
+import java.util.Date;
import java.util.List;
+import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import javax.mail.Message;
+import javax.mail.MessagingException;
+import javax.mail.Session;
+import javax.mail.Transport;
+import javax.mail.internet.AddressException;
+import javax.mail.internet.InternetAddress;
+import javax.mail.internet.MimeMessage;
+
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
import org.wamblee.conditions.Condition;
-import org.wamblee.conditions.OrCondition;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
import org.wamblee.crawler.impl.ConfigurationParser;
import org.wamblee.crawler.impl.CrawlerImpl;
*
*/
public class KissCrawler {
-
+
private static final Log LOG = LogFactory.getLog(KissCrawler.class);
private static final String LOG_FILE = "kiss.log";
private static final String START_URL = "http://epg.kml.kiss-technology.com/login_core.php";
private static final String CRAWLER_CONFIG = "config.xml";
-
+
private static final String PROGRAM_CONFIG = "programs.xml";
private static final String TIME_REGEX = "([0-9]{2}):([0-9]{2})[^0-9]*([0-9]{2}):([0-9]{2}).*";
private Pattern _pattern;
- public KissCrawler(String aStartUrl, String aCrawlerConfig, String aProgramConfig) throws Exception {
+ public KissCrawler(String aStartUrl, String aCrawlerConfig,
+ String aProgramConfig) throws IOException, AddressException,
+ MessagingException {
_pattern = Pattern.compile(TIME_REGEX);
PrintStream os = new PrintStream(fos);
try {
- ConfigurationParser parser = new ConfigurationParser(os);
- InputStream crawlerConfigFile = new FileInputStream(new File(aCrawlerConfig));
- Configuration config = parser.parse(crawlerConfigFile);
-
- InputStream programConfigFile = new FileInputStream(new File(aProgramConfig));
- Condition<Program> programCondition = new ProgramConfigurationParser().parse(programConfigFile);
-
-
HttpClient client = new HttpClient();
// client.getHostConfiguration().setProxy("localhost", 3128);
- Crawler crawler = new CrawlerImpl(client, config);
+ Crawler crawler = createCrawler(aCrawlerConfig, os, client);
- Page page = crawler.getPage(aStartUrl);
- showPage(page);
- page = page.getAction("channels-favorites").execute();
+ Page page = getStartPage(aStartUrl, crawler);
TVGuide guide = createGuide(page);
PrintVisitor printer = new PrintVisitor(System.out);
guide.accept(printer);
-
- MatchVisitor matcher = new MatchVisitor(programCondition);
- guide.accept(matcher);
- List<Program> programs = matcher.getMatches();
- for (Program program: programs) {
- System.out.println("Found: " + program + " record: " + program.record() );
- }
-
+
+ InputStream programConfigFile = new FileInputStream(new File(
+ aProgramConfig));
+ Condition<Program> programCondition = new ProgramConfigurationParser()
+ .parse(programConfigFile);
+ recordInterestingShows(programCondition, guide);
} finally {
os.flush();
os.close();
}
}
+ /**
+  * Tries to record every program of the guide that matches the condition and
+  * reports the outcome on standard output and by mail.
+  *
+  * A program whose recording attempt throws a PageException is listed under
+  * the failures; the remaining programs are still processed.
+  *
+  * @param programCondition Condition that selects the programs to record.
+  * @param guide TV guide to search for matching programs.
+  * @throws AddressException When sending the mail fails on an address.
+  * @throws MessagingException When sending the mail fails.
+  */
+ private void recordInterestingShows(Condition<Program> programCondition,
+         TVGuide guide) throws AddressException, MessagingException {
+     MatchVisitor matcher = new MatchVisitor(programCondition);
+     guide.accept(matcher);
+     List<Program> programs = matcher.getMatches();
+     StringBuilder recorded = new StringBuilder();
+     StringBuilder notRecorded = new StringBuilder();
+     StringBuilder failures = new StringBuilder();
+     for (Program program : programs) {
+         try {
+             if (program.record()) {
+                 recorded.append("\n").append(program);
+             } else {
+                 notRecorded.append("\n").append(program);
+             }
+         } catch (PageException e) {
+             // Keep the stack trace in the log; the mail only gets the message.
+             LOG.warn("Attempt to record " + program + " failed.", e);
+             failures.append("\n").append(program).append(": ").append(
+                     e.getMessage());
+         }
+     }
+     StringBuilder msg = new StringBuilder("Summary of KiSS crawler: \n\n\n");
+
+     if (recorded.length() > 0) {
+         msg.append("Recorded programs:\n\n").append(recorded).append("\n\n");
+     }
+     if (notRecorded.length() > 0) {
+         msg.append("Not recorded programs:\n\n").append(notRecorded).append("\n\n");
+     }
+     // Only claim that nothing was found when there really were no matches;
+     // matches whose recording failed are reported under the failures instead.
+     if (programs.isEmpty()) {
+         msg.append("No suitable programs found");
+     }
+     if (failures.length() > 0) {
+         msg.append("Failures:\n\n").append(failures);
+     }
+     String summary = msg.toString();
+     System.out.println(summary);
+     sendMail(summary);
+ }
+
+ /**
+  * Creates a crawler from the crawler configuration file.
+  *
+  * @param aCrawlerConfig Name of the crawler configuration file.
+  * @param os Stream that is handed to the configuration parser.
+  * @param client Http client for the crawler to use.
+  * @return Crawler.
+  * @throws FileNotFoundException When the configuration file cannot be opened.
+  */
+ private Crawler createCrawler(String aCrawlerConfig, PrintStream os,
+         HttpClient client) throws FileNotFoundException {
+     ConfigurationParser parser = new ConfigurationParser(os);
+     InputStream crawlerConfigFile = new FileInputStream(new File(
+             aCrawlerConfig));
+     try {
+         // NOTE(review): assumes parse() has read the stream completely when
+         // it returns, so closing the file afterwards is safe - confirm.
+         Configuration config = parser.parse(crawlerConfigFile);
+         return new CrawlerImpl(client, config);
+     } finally {
+         // Release the file handle whether or not parsing succeeded.
+         try {
+             crawlerConfigFile.close();
+         } catch (IOException e) {
+             LOG.warn("Could not close crawler configuration '"
+                     + aCrawlerConfig + "'", e);
+         }
+     }
+ }
+
+ /**
+  * Retrieves the page at the start URL and follows its "channels-favorites"
+  * action.
+  *
+  * @param aStartUrl Url of the page to start at.
+  * @param crawler Crawler used to retrieve the page.
+  * @return Page obtained by executing the "channels-favorites" action.
+  * @throws RuntimeException When a page cannot be retrieved or the start page
+  *         does not offer the "channels-favorites" action.
+  */
+ private Page getStartPage(String aStartUrl, Crawler crawler) {
+     try {
+         Page page = crawler.getPage(aStartUrl);
+         // Other code in this project checks the result of getAction for
+         // null, so fail with a clear message instead of a NullPointerException.
+         Action favorites = page.getAction("channels-favorites");
+         if (favorites == null) {
+             throw new RuntimeException(
+                     "Could not login to electronic program guide: "
+                             + "action 'channels-favorites' not found on start page");
+         }
+         return favorites.execute();
+     } catch (PageException e) {
+         throw new RuntimeException(
+                 "Could not login to electronic program guide", e);
+     }
+ }
+
+ /**
+ * Runs the crawler with the built-in start URL and configuration file names.
+ * All work is done by the KissCrawler constructor.
+ *
+ * @param args Command line arguments; not used.
+ * @throws Exception When the crawler fails.
+ */
public static void main(String[] args) throws Exception {
new KissCrawler(START_URL, CRAWLER_CONFIG, PROGRAM_CONFIG);
}
Action[] actions = page.getActions();
List<Channel> channels = new ArrayList<Channel>();
for (Action action : actions) {
- Channel channel = createChannel(action.getName(), action.execute()
- .getAction("right-now").execute());
- channels.add(channel);
+ try {
+ LOG.info("Getting channel info for '" + action.getName() + "'");
+ Channel channel = createChannel(action.getName(), action
+ .execute().getAction("right-now").execute());
+ channels.add(channel);
+ } catch (PageException e) {
+ LOG.error("Could not create channel information for '"
+ + action.getName() + "'", e);
+ }
}
return new TVGuide(channels);
}
String time = action.getContent().element("time").getText().trim();
Matcher matcher = _pattern.matcher(time);
if (matcher.matches()) {
- Time begin = new Time(Integer.parseInt(matcher.group(1)),
- Integer.parseInt(matcher.group(2)));
- Time end = new Time(Integer.parseInt(matcher.group(3)),
- Integer.parseInt(matcher.group(4)));
+ Time begin = new Time(Integer.parseInt(matcher.group(1)),
+ Integer.parseInt(matcher.group(2)));
+ Time end = new Time(Integer.parseInt(matcher.group(3)), Integer
+ .parseInt(matcher.group(4)));
TimeInterval interval = new TimeInterval(begin, end);
- //Page programInfo = action.execute();
- //String description = programInfo.getContent().element("description").getText().trim();
- //String keywords = programInfo.getContent().element("keywords").getText().trim();
+ // Page programInfo = action.execute();
+ // String description =
+ // programInfo.getContent().element("description").getText().trim();
+ // String keywords =
+ // programInfo.getContent().element("keywords").getText().trim();
String description = "";
String keywords = "";
- Program program = new Program(aChannel, action.getName(), description, keywords, interval, action);
-
+ Program program = new Program(aChannel, action.getName(),
+ description, keywords, interval, action);
+
LOG.debug("Got program " + program);
programs.add(program);
}
}
return new Channel(aChannel, programs);
}
+
+ /**
+  * Mails the given text as a "KiSS crawler update" message. Sender,
+  * recipient and SMTP server are fixed in this method.
+  *
+  * @param aText Body of the mail.
+  * @throws AddressException When the mail address cannot be parsed.
+  * @throws MessagingException When the message cannot be composed or sent.
+  */
+ private void sendMail(String aText) throws AddressException,
+         MessagingException {
+     // Plain SMTP transport settings.
+     Properties smtpSettings = new Properties();
+     smtpSettings.put("mail.transport.protocol", "smtp");
+     smtpSettings.put("mail.smtp.host", "falcon");
+     smtpSettings.put("mail.smtp.port", "25");
+
+     Message mail = new MimeMessage(Session.getInstance(smtpSettings));
+
+     // Sender and recipient are the same address.
+     InternetAddress address = new InternetAddress("erik@brakkee.org");
+     mail.setFrom(address);
+     mail.setRecipient(Message.RecipientType.TO, address);
+     mail.setSentDate(new Date());
+     mail.setSubject("KiSS crawler update");
+     mail.setText(aText);
+
+     Transport.send(mail);
+ }
}
package org.wamblee.crawler.kiss;
import org.wamblee.crawler.Action;
+import org.wamblee.crawler.PageException;
/**
*
return _interval;
}
- public boolean record() {
+ public boolean record() throws PageException {
Action record = _programInfo.execute().getAction(RECORD_ACTION);
if ( record == null) {
return false;