From 3e42f5eea742563cdeab7d655fe55f7d0e25ea16 Mon Sep 17 00:00:00 2001 From: Nicholas Car Date: Fri, 14 Aug 2020 00:07:44 +1000 Subject: [PATCH] improved Graph().parse() --- rdflib/extras/describer.py | 2 +- rdflib/graph.py | 34 +++++++++------ rdflib/util.py | 2 +- test/rdf/datatypes/test001.borked | 29 +++++++++++++ test/test_graph.py | 63 +++++++++++++++++++++++++++- test/test_issue247.py | 4 +- test/test_issue363.py | 2 +- test/test_issue_git_336.py | 2 +- test/test_literal.py | 2 +- test/test_parse_file_guess_format.py | 7 ++-- test/test_parser.py | 1 + test/test_seq.py | 2 +- test/test_xmlliterals.py | 2 +- 13 files changed, 125 insertions(+), 27 deletions(-) create mode 100644 test/rdf/datatypes/test001.borked diff --git a/rdflib/extras/describer.py b/rdflib/extras/describer.py index cec3b6020..8afce1280 100644 --- a/rdflib/extras/describer.py +++ b/rdflib/extras/describer.py @@ -102,7 +102,7 @@ ... ... ... - ... ''') + ... ''', format="xml") >>> >>> from rdflib.compare import isomorphic >>> isomorphic(person_graph, expected) #doctest: +SKIP diff --git a/rdflib/graph.py b/rdflib/graph.py index 145224b8f..786f193c0 100644 --- a/rdflib/graph.py +++ b/rdflib/graph.py @@ -24,6 +24,7 @@ from rdflib.resource import Resource from rdflib.collection import Collection import rdflib.util # avoid circular dependency +from rdflib.exceptions import ParserError import os import shutil @@ -996,7 +997,7 @@ def parse( **args ): """ - Parse source adding the resulting triples to the Graph. + Parse an RDF source adding the resulting triples to the Graph. The source is specified using one of source, location, file or data. @@ -1010,9 +1011,10 @@ def parse( is specified. - `file`: A file-like object. - `data`: A string containing the data to be parsed. - - `format`: Used if format can not be determined from source. - Defaults to rdf/xml. Format support can be extended with plugins, - but "xml", "n3", "nt" & "trix" are built in. + - `format`: Used if format can not be determined from source, e.g. file + extension or Media Type. Defaults to text/turtle. Format support can + be extended with plugins, but "xml", "n3" (use for turtle), "nt" & + "trix" are built in. - `publicID`: the logical URI to use as the document base. If None specified the document location is used (at least in the case where there is a document location). @@ -1058,6 +1060,11 @@ def parse( >>> os.remove(file_name) + >>> # default turtle parsing + >>> result = g.parse(data=" .") + >>> len(g) + 3 + """ source = create_input_source( @@ -1070,24 +1077,25 @@ def parse( ) if format is None: format = source.content_type - assumed_xml = False + could_not_guess_format = False if format is None: if (hasattr(source, "file") and getattr(source.file, "name", None) and isinstance(source.file.name, str)): format = rdflib.util.guess_format(source.file.name) if format is None: - format = "application/rdf+xml" - assumed_xml = True + format = "turtle" + could_not_guess_format = True parser = plugin.get(format, Parser)() try: parser.parse(source, self, **args) - except SAXParseException as saxpe: - if assumed_xml: - logger.warning( - "Could not guess format for %r, so assumed xml." - " You can explicitly specify format using the format argument." % source) - raise saxpe + except SyntaxError as se: + if could_not_guess_format: + raise ParserError( + "Could not guess RDF format for %r from file extension so tried Turtle but failed." + "You can explicitly specify format using the format argument." % source) + else: + raise se finally: if source.auto_close: source.close() diff --git a/rdflib/util.py b/rdflib/util.py index 92996ec79..c5f1ff09b 100644 --- a/rdflib/util.py +++ b/rdflib/util.py @@ -352,8 +352,8 @@ def parse_date_time(val): SUFFIX_FORMAT_MAP = { + "xml": "xml", "rdf": "xml", - "rdfs": "xml", "owl": "xml", "n3": "n3", "ttl": "turtle", diff --git a/test/rdf/datatypes/test001.borked b/test/rdf/datatypes/test001.borked new file mode 100644 index 000000000..a4c86aea7 --- /dev/null +++ b/test/rdf/datatypes/test001.borked @@ -0,0 +1,29 @@ + + + + + + + + 10 + 10 + + + diff --git a/test/test_graph.py b/test/test_graph.py index 0032213e6..86b913eab 100644 --- a/test/test_graph.py +++ b/test/test_graph.py @@ -5,7 +5,9 @@ from tempfile import mkdtemp, mkstemp import shutil -from rdflib import URIRef, RDF, Graph, plugin +from rdflib import URIRef, Graph, plugin +from rdflib.exceptions import ParserError +from rdflib.plugin import PluginException from nose.exc import SkipTest @@ -248,6 +250,65 @@ def testGraphIntersection(self): self.assertEqual((michel, likes, cheese) in g1, True) + def testGuessFormatForParse(self): + self.graph = Graph() + + # files + with self.assertRaises(ParserError): + self.graph.parse(__file__) # here we are trying to parse a Python file!! + + # .nt can be parsed by Turtle Parser + self.graph.parse("test/nt/anons-01.nt") + # RDF/XML + self.graph.parse("test/rdf/datatypes/test001.rdf") # XML + # bad filename but set format + self.graph.parse("test/rdf/datatypes/test001.borked", format="xml") + + # strings + self.graph = Graph() + + with self.assertRaises(ParserError): + self.graph.parse(data="rubbish") + + # Turtle - default + self.graph.parse(data=" .") + + # Turtle - format given + self.graph.parse(data=" .", format="turtle") + + # RDF/XML - format given + rdf = """ + + + + + + + + + + + + + + """ + self.graph.parse(data=rdf, format="xml") + + # URI + self.graph = Graph() + + # only getting HTML + with self.assertRaises(PluginException): + self.graph.parse(location="https://www.google.com") + + self.graph.parse(location="http://www.w3.org/ns/adms.ttl") + self.graph.parse(location="http://www.w3.org/ns/adms.rdf") + # persistent Australian Government online RDF resource without a file-like ending + self.graph.parse(location="https://linked.data.gov.au/def/agrif?_format=text/turtle") + # dynamically create classes for each registered Store diff --git a/test/test_issue247.py b/test/test_issue247.py index 747dd1e06..7a51dd24e 100644 --- a/test/test_issue247.py +++ b/test/test_issue247.py @@ -38,7 +38,7 @@ def test_successful_parse_of_literal_without_xmllang_attr(self): it contains a XML Literal with a xml:lang attribute: """ g = rdflib.Graph() - g.parse(data=passxml) + g.parse(data=passxml, format="xml") def test_failing_parse_of_literal_with_xmllang_attr(self): """ @@ -47,7 +47,7 @@ def test_failing_parse_of_literal_with_xmllang_attr(self): it contains a XML Literal with a xml:lang attribute: """ g = rdflib.Graph() - g.parse(data=failxml) + g.parse(data=failxml, format="xml") if __name__ == "__main__": diff --git a/test/test_issue363.py b/test/test_issue363.py index 792c2441e..5f88a6f40 100644 --- a/test/test_issue363.py +++ b/test/test_issue363.py @@ -38,7 +38,7 @@ def p(): def test_parsetype_resource(): - g = rdflib.Graph().parse(data=data2) + g = rdflib.Graph().parse(data=data2, format="xml") print(g.serialize(format="n3")) diff --git a/test/test_issue_git_336.py b/test/test_issue_git_336.py index 6a8abb7c3..c3d4a5810 100644 --- a/test/test_issue_git_336.py +++ b/test/test_issue_git_336.py @@ -37,7 +37,7 @@ def test_ns_localname_roundtrip(): xmldump = g.serialize().decode("utf-8") g1 = rdflib.Graph() - g1.parse(data=xmldump) + g1.parse(data=xmldump, format="xml") g1.parse(data=turtledump, format="turtle") diff --git a/test/test_literal.py b/test/test_literal.py index 8124f99d8..bc6919b70 100644 --- a/test/test_literal.py +++ b/test/test_literal.py @@ -33,7 +33,7 @@ def test_backslash(self): """ g = rdflib.Graph() - g.parse(data=d) + g.parse(data=d, format="xml") a = rdflib.Literal("a\\b") b = list(g.objects())[0] self.assertEqual(a, b) diff --git a/test/test_parse_file_guess_format.py b/test/test_parse_file_guess_format.py index abb039df3..5706f8df1 100644 --- a/test/test_parse_file_guess_format.py +++ b/test/test_parse_file_guess_format.py @@ -3,7 +3,7 @@ from shutil import copyfile from tempfile import TemporaryDirectory -from xml.sax import SAXParseException +from rdflib.exceptions import ParserError from rdflib import Graph, logger as graph_logger @@ -21,11 +21,10 @@ def test_warning(self): g = Graph() with TemporaryDirectory() as tmpdirname: newpath = Path(tmpdirname).joinpath("no_file_ext") - copyfile("test/w3c/turtle/IRI_subject.ttl", str(newpath)) + copyfile("test/rdf/Manifest.rdf", str(newpath)) with self.assertLogs(graph_logger, "WARNING") as log_cm: - with self.assertRaises(SAXParseException): + with self.assertRaises(ParserError): g.parse(str(newpath)) - self.assertTrue(any("Could not guess format" in msg for msg in log_cm.output)) if __name__ == '__main__': diff --git a/test/test_parser.py b/test/test_parser.py index 3aaf56584..e337969ca 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -33,6 +33,7 @@ def testNoPathWithHash(self): """, + format="xml", publicID="http://example.org", ) diff --git a/test/test_seq.py b/test/test_seq.py index 7f1775749..5a987ef45 100644 --- a/test/test_seq.py +++ b/test/test_seq.py @@ -29,7 +29,7 @@ class SeqTestCase(unittest.TestCase): def setUp(self): store = self.store = Graph(store=self.backend) store.open(self.path) - store.parse(data=s) + store.parse(data=s, format="xml") def tearDown(self): self.store.close() diff --git a/test/test_xmlliterals.py b/test/test_xmlliterals.py index fcc0ddf2e..aeabbe888 100644 --- a/test/test_xmlliterals.py +++ b/test/test_xmlliterals.py @@ -42,7 +42,7 @@ def testRDFXMLParse(): """ g = rdflib.Graph() - g.parse(data=rdfxml) + g.parse(data=rdfxml, format="xml") l1 = list(g)[0][2] assert l1.datatype == RDF.XMLLiteral