From 2c65c3ac67c3a5c4477f74cc3f2e5dcf34d950ef Mon Sep 17 00:00:00 2001 From: erik Date: Sun, 20 Aug 2006 15:53:57 +0000 Subject: [PATCH] slightly more robust XML parsing of the GPX track (elevation is now optional). DomUtils now uses XMLException instead of the 4 different types from before. --- .../basic/src/org/wamblee/crawler/Action.java | 1 - crawler/kiss/conf/kiss/programs.xml | 4 + gps/src/org/wamblee/gpx/GpxParser.java | 15 +- gps/src/org/wamblee/gpx/gpx.xsd | 788 ++++++++++++++++++ support/src/org/wamblee/xml/DomUtils.java | 166 +++- support/src/org/wamblee/xml/XMLException.java | 31 + 6 files changed, 955 insertions(+), 50 deletions(-) create mode 100644 gps/src/org/wamblee/gpx/gpx.xsd create mode 100644 support/src/org/wamblee/xml/XMLException.java diff --git a/crawler/basic/src/org/wamblee/crawler/Action.java b/crawler/basic/src/org/wamblee/crawler/Action.java index f24cacd0..cd9b4e2a 100644 --- a/crawler/basic/src/org/wamblee/crawler/Action.java +++ b/crawler/basic/src/org/wamblee/crawler/Action.java @@ -16,7 +16,6 @@ package org.wamblee.crawler; -import org.apache.commons.httpclient.NameValuePair; import org.dom4j.Element; /** diff --git a/crawler/kiss/conf/kiss/programs.xml b/crawler/kiss/conf/kiss/programs.xml index 38fb018d..a07c4c57 100644 --- a/crawler/kiss/conf/kiss/programs.xml +++ b/crawler/kiss/conf/kiss/programs.xml @@ -30,6 +30,10 @@ notify sf-|(sci-fi)|(science fiction) + + + invasion + notify diff --git a/gps/src/org/wamblee/gpx/GpxParser.java b/gps/src/org/wamblee/gpx/GpxParser.java index 500091b3..fe675ce6 100644 --- a/gps/src/org/wamblee/gpx/GpxParser.java +++ b/gps/src/org/wamblee/gpx/GpxParser.java @@ -16,29 +16,28 @@ package org.wamblee.gpx; -import java.io.IOException; import java.io.InputStream; import java.util.Iterator; -import javax.xml.parsers.ParserConfigurationException; - import org.dom4j.Document; import org.dom4j.Element; import org.wamblee.gps.track.Track; import org.wamblee.gps.track.TrackPoint; import 
org.wamblee.xml.DomUtils; -import org.xml.sax.SAXException; +import org.wamblee.xml.XMLException; /** * Parser for GPX tracks. */ public class GpxParser { + private static final String SCHEMA_RESOURCE = "gpx.xsd"; + public GpxParser() { // Empty. } - public Track parse(InputStream aIs) throws SAXException, ParserConfigurationException, IOException { + public Track parse(InputStream aIs) throws XMLException { Document doc = DomUtils.convert(DomUtils.read(aIs)); return parse(doc); } @@ -63,7 +62,11 @@ public class GpxParser { //System.out.println(trkpt.asXML() + "|\n"); double latitude = new Double(trkpt.attributeValue("lat")); double longitude = new Double(trkpt.attributeValue("lon")); - double elevation = new Double(trkpt.elementText("ele")); + Element ele = trkpt.element("ele"); + double elevation = 0.0; + if ( ele != null ) { + elevation = new Double(ele.getText()); + } //System.out.println(" lat = " + lat + " lon = " + lon + " ele = " + ele); return new TrackPoint(latitude, longitude, elevation); } diff --git a/gps/src/org/wamblee/gpx/gpx.xsd b/gps/src/org/wamblee/gpx/gpx.xsd new file mode 100644 index 00000000..0ce1605c --- /dev/null +++ b/gps/src/org/wamblee/gpx/gpx.xsd @@ -0,0 +1,788 @@ + + + + + + GPX schema version 1.1 - For more information on GPX and this schema, visit http://www.topografix.com/gpx.asp + + GPX uses the following conventions: all coordinates are relative to the WGS84 datum. All measurements are in metric units. + + + + + + + GPX is the root element in the XML file. + + + + + + + + GPX documents contain a metadata header, followed by waypoints, routes, and tracks. You can add your own elements + to the extensions section of the GPX document. + + + + + + + Metadata about the file. + + + + + + + A list of waypoints. + + + + + + + A list of routes. + + + + + + + A list of tracks. + + + + + + + You can add extend GPX by adding your own elements from another schema here. + + + + + + + + + You must include the version number in your GPX document. 
+ + + + + + + You must include the name or URL of the software that created your GPX document. This allows others to + inform the creator of a GPX instance document that fails to validate. + + + + + + + + + Information about the GPX file, author, and copyright restrictions goes in the metadata section. Providing rich, + meaningful information about your GPX files allows others to search for and use your GPS data. + + + + + + + The name of the GPX file. + + + + + + + A description of the contents of the GPX file. + + + + + + + The person or organization who created the GPX file. + + + + + + + Copyright and license information governing use of the file. + + + + + + + URLs associated with the location described in the file. + + + + + + + The creation date of the file. + + + + + + + Keywords associated with the file. Search engines or databases can use this information to classify the data. + + + + + + + Minimum and maximum coordinates which describe the extent of the coordinates in the file. + + + + + + + + You can extend GPX by adding your own elements from another schema here. + + + + + + + + + + wpt represents a waypoint, point of interest, or named feature on a map. + + + + + + + + Elevation (in meters) of the point. + + + + + + + Creation/modification timestamp for element. Date and time are in Coordinated Universal Time (UTC), not local time! Conforms to ISO 8601 specification for date/time representation. Fractional seconds are allowed for millisecond timing in tracklogs. + + + + + + + Magnetic variation (in degrees) at the point. + + + + + + + Height (in meters) of geoid (mean sea level) above WGS84 earth ellipsoid. As defined in NMEA GGA message. + + + + + + + + + The GPS name of the waypoint. This field will be transferred to and from the GPS. GPX does not place restrictions on the length of this field or the characters contained in it. It is up to the receiving application to validate the field before sending it to the GPS. 
+ + + + + + + GPS waypoint comment. Sent to GPS as comment. + + + + + + + A text description of the element. Holds additional information about the element intended for the user, not the GPS. + + + + + + + Source of data. Included to give user some idea of reliability and accuracy of data. "Garmin eTrex", "USGS quad Boston North", e.g. + + + + + + + Link to additional information about the waypoint. + + + + + + + Text of GPS symbol name. For interchange with other programs, use the exact spelling of the symbol as displayed on the GPS. If the GPS abbreviates words, spell them out. + + + + + + + Type (classification) of the waypoint. + + + + + + + + + Type of GPS fix. + + + + + + + Number of satellites used to calculate the GPS fix. + + + + + + + Horizontal dilution of precision. + + + + + + + Vertical dilution of precision. + + + + + + + Position dilution of precision. + + + + + + + Number of seconds since last DGPS update. + + + + + + + ID of DGPS station used in differential correction. + + + + + + + + You can extend GPX by adding your own elements from another schema here. + + + + + + + + + The latitude of the point. Decimal degrees, WGS84 datum. + + + + + + + The longitude of the point. Decimal degrees, WGS84 datum. + + + + + + + + + rte represents route - an ordered list of waypoints representing a series of turn points leading to a destination. + + + + + + + GPS name of route. + + + + + + + GPS comment for route. + + + + + + + Text description of route for user. Not sent to GPS. + + + + + + + Source of data. Included to give user some idea of reliability and accuracy of data. + + + + + + + Links to external information about the route. + + + + + + + GPS route number. + + + + + + + Type (classification) of route. + + + + + + + + You can extend GPX by adding your own elements from another schema here. + + + + + + + + A list of route points. + + + + + + + + + + trk represents a track - an ordered list of points describing a path. 
+ + + + + + + GPS name of track. + + + + + + + GPS comment for track. + + + + + + + User description of track. + + + + + + + Source of data. Included to give user some idea of reliability and accuracy of data. + + + + + + + Links to external information about track. + + + + + + + GPS track number. + + + + + + + Type (classification) of track. + + + + + + + + You can extend GPX by adding your own elements from another schema here. + + + + + + + + A Track Segment holds a list of Track Points which are logically connected in order. To represent a single GPS track where GPS reception was lost, or the GPS receiver was turned off, start a new Track Segment for each continuous span of track data. + + + + + + + + + + You can extend GPX by adding your own elements from another schema here. + + + + + + + You can extend GPX by adding your own elements from another schema here. + + + + + + + + + + A Track Segment holds a list of Track Points which are logically connected in order. To represent a single GPS track where GPS reception was lost, or the GPS receiver was turned off, start a new Track Segment for each continuous span of track data. + + + + + + + A Track Point holds the coordinates, elevation, timestamp, and metadata for a single point in a track. + + + + + + + + You can extend GPX by adding your own elements from another schema here. + + + + + + + + + + Information about the copyright holder and any license governing use of this file. By linking to an appropriate license, + you may place your data into the public domain or grant additional usage rights. + + + + + + + Year of copyright. + + + + + + + Link to external file containing license text. + + + + + + + + Copyright holder (TopoSoft, Inc.) + + + + + + + + + A link to an external resource (Web page, digital photo, video clip, etc.) with additional information. + + + + + + + Text of hyperlink. + + + + + + + MIME type of content (image/jpeg) + + + + + + + + URL of hyperlink. 
+ + + + + + + + + An email address. Broken into two parts (id and domain) to help prevent email harvesting. + + + + + + id half of email address (billgates2004) + + + + + + + domain half of email address (hotmail.com) + + + + + + + + + A person or organization. + + + + + + + Name of person or organization. + + + + + + + Email address. + + + + + + + Link to Web site or other external information about person. + + + + + + + + + + A geographic point with optional elevation and time. Available for use by other schemas. + + + + + + + The elevation (in meters) of the point. + + + + + + + The time that the point was recorded. + + + + + + + + The latitude of the point. Decimal degrees, WGS84 datum. + + + + + + + The longitude of the point. Decimal degrees, WGS84 datum. + + + + + + + + + An ordered sequence of points (e.g. for polygons or polylines). + + + + + + + Ordered list of geographic points. + + + + + + + + + + Two lat/lon pairs defining the extent of an element. + + + + + + The minimum latitude. + + + + + + + The minimum longitude. + + + + + + + The maximum latitude. + + + + + + + The maximum longitude. + + + + + + + + + + The latitude of the point. Decimal degrees, WGS84 datum. + + + + + + + + + + + + The longitude of the point. Decimal degrees, WGS84 datum. + + + + + + + + + + + + Used for bearing, heading, course. Units are decimal degrees, true (not magnetic). + + + + + + + + + + + + Type of GPS fix. none means GPS had no fix. To signify "the fix info is unknown", leave out fixType entirely. pps = military signal used + + + + + + + + + + + + + + + Represents a differential GPS station. + + + + + + + + + diff --git a/support/src/org/wamblee/xml/DomUtils.java b/support/src/org/wamblee/xml/DomUtils.java index 05eae128..5651e341 100644 --- a/support/src/org/wamblee/xml/DomUtils.java +++ b/support/src/org/wamblee/xml/DomUtils.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.xml; @@ -26,9 +26,13 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; +import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.stream.StreamSource; +import javax.xml.validation.Schema; +import javax.xml.validation.SchemaFactory; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -45,84 +49,160 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; +import com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl; +import com.sun.org.apache.xerces.internal.jaxp.validation.xs.SchemaFactoryImpl; + /** - * Some basic XML utilities for common reoccuring tasks for - * DOM documents. + * Some basic XML utilities for common reoccuring tasks for DOM documents. */ public final class DomUtils { - + private static final Log LOG = LogFactory.getLog(DomUtils.class); - + /** - * Disabled default constructor. - * + * Disabled default constructor. + * */ - private DomUtils() { - // Empty. + private DomUtils() { + // Empty. } - + /** - * Parses an XML document from a string. - * @param aDocument document. - * @return + * Parses an XML document from a string. + * + * @param aDocument + * document. + * @return */ - public static Document read(String aDocument) throws SAXException, ParserConfigurationException, IOException { + public static Document read(String aDocument) throws XMLException { ByteArrayInputStream is = new ByteArrayInputStream(aDocument.getBytes()); - return read(is); + return read(is); } - + /** - * Parses an XML document from a stream. - * @param aIs Input stream. + * Parses an XML document from a stream. + * + * @param aIs + * Input stream. 
* @return */ - public static Document read(InputStream aIs) throws SAXException, ParserConfigurationException, IOException { - DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); - return builder.parse(aIs); + public static Document read(InputStream aIs) throws XMLException { + try { + DocumentBuilder builder = DocumentBuilderFactory.newInstance() + .newDocumentBuilder(); + return builder.parse(aIs); + } catch (SAXException e) { + throw new XMLException(e.getMessage(), e); + } catch (IOException e) { + throw new XMLException(e.getMessage(), e); + } catch (ParserConfigurationException e) { + throw new XMLException(e.getMessage(), e); + } finally { + try { + aIs.close(); + } catch (Exception e) { + LOG.warn("Error closing XML file", e); + } + } } /** - * Serializes an XML document to a stream. - * @param aDocument Document to serialize. - * @param aOs Output stream. + * Reads and validates a document against a schema. + * + * @param aIs + * Input stream. + * @param aSchema + * Schema. + * @return Parsed and validated document. 
*/ - public static void serialize(Document aDocument, OutputStream aOs) throws IOException { + public static Document readAndValidate(InputStream aIs, InputStream aSchema) + throws XMLException { + + try { + final Schema schema = SchemaFactory.newInstance( + XMLConstants.W3C_XML_SCHEMA_NS_URI).newSchema( + new StreamSource(aSchema)); + + final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setValidating(true); + factory.setNamespaceAware(true); + factory.setSchema(schema); + + return factory.newDocumentBuilder().parse(aIs); + } catch (SAXException e) { + throw new XMLException(e.getMessage(), e); + } catch (IOException e) { + throw new XMLException(e.getMessage(), e); + } catch (ParserConfigurationException e) { + throw new XMLException(e.getMessage(), e); + } finally { + try { + aSchema.close(); + } catch (Exception e) { + LOG.warn("Error closing schema", e); + } + try { + aIs.close(); + } catch (Exception e) { + LOG.warn("Error closing XML file", e); + } + } + + } + + /** + * Serializes an XML document to a stream. + * + * @param aDocument + * Document to serialize. + * @param aOs + * Output stream. + */ + public static void serialize(Document aDocument, OutputStream aOs) + throws IOException { XMLSerializer serializer = new XMLSerializer(aOs, new OutputFormat()); serializer.serialize(aDocument); } - + /** - * Serializes an XML document. - * @param aDocument Document to serialize. - * @return Serialized document. + * Serializes an XML document. + * + * @param aDocument + * Document to serialize. + * @return Serialized document. */ public static String serialize(Document aDocument) throws IOException { ByteArrayOutputStream os = new ByteArrayOutputStream(); - serialize(aDocument, os); - return os.toString(); + serialize(aDocument, os); + return os.toString(); } - + /** - * Converts a dom4j document into a w3c DOM document. - * @param aDocument Document to convert. - * @return W3C DOM document. 
+ * Converts a dom4j document into a w3c DOM document. + * + * @param aDocument + * Document to convert. + * @return W3C DOM document. */ - public static Document convert(org.dom4j.Document aDocument) throws DocumentException { + public static Document convert(org.dom4j.Document aDocument) + throws DocumentException { return new DOMWriter().write(aDocument); } /** - * Converts a W3C DOM document into a dom4j document. - * @param aDocument Document to convert. + * Converts a W3C DOM document into a dom4j document. + * + * @param aDocument + * Document to convert. * @return Dom4j document. */ - public static org.dom4j.Document convert(Document aDocument) { - return new DOMReader().read(aDocument); + public static org.dom4j.Document convert(Document aDocument) { + return new DOMReader().read(aDocument); } - + /** - * Removes duplicate attributes from a DOM tree.This is useful for postprocessing the - * output of JTidy as a workaround for a bug in JTidy. + * Removes duplicate attributes from a DOM tree.This is useful for + * postprocessing the output of JTidy as a workaround for a bug in JTidy. * * @param aNode * Node to remove duplicate attributes from (recursively). diff --git a/support/src/org/wamblee/xml/XMLException.java b/support/src/org/wamblee/xml/XMLException.java new file mode 100644 index 00000000..53f1d5e2 --- /dev/null +++ b/support/src/org/wamblee/xml/XMLException.java @@ -0,0 +1,31 @@ +/* + * Copyright 2006 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.xml; + +/** + * Exception thrown in case of XML parsing problems. + */ +public class XMLException extends Exception { + + public XMLException(String aMsg) { + super(aMsg); + } + + public XMLException(String aMsg, Throwable aCause) { + super(aMsg, aCause); + } +} -- 2.31.1