Skip to content

Commit

Permalink
TriGParser: distinguish empty entities from no-entity being read (#42)
Browse files Browse the repository at this point in the history
* TriGParser: distinguish empty entities from no-entity being read

See #37
(closes #37)

* TriGParserTest::testBlankNodes() adjusted

* removed the prefixed-only IRIs input line from the first test scenario
  as this does not belong to the testBlankNodes() tests and is tested
  already in testIssue37()
* turned the empty prefixed IRIs test scenario into two - first, where
  an error is expected due to unknown document base IRI and second,
  where parsing succeeds thanks to `documentIRI` parser option being set

* Update test/TriGParserTest.php

* Update test/TriGParserTest.php

---------

Co-authored-by: Konrad Abicht <hi@inspirito.de>
  • Loading branch information
zozlak and k00ni authored Feb 28, 2024
1 parent f70f13e commit 01bf556
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 10 deletions.
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,33 @@ $parser->end(); //Needs to be called
* `end(): array<array{'subject': string, 'predicate': string, 'object': string, 'graph': string}>`
* `explicitQuantifiers` - [...]

#### Empty document base IRI

Some Turtle and N3 documents may use relative-to-the-base-IRI IRI syntax (see [here](https://www.w3.org/TR/turtle/#sec-iri) and [here](https://www.w3.org/TR/turtle/#sec-iri-references)), e.g.

```
<> <someProperty> "some value" .
```

To properly parse such documents the document base IRI must be known.
Otherwise we might end up with empty IRIs (e.g. for the subject in the example above).

Sometimes the base IRI is encoded in the document, e.g.

```
@base <http://some.base/iri/> .
<> <someProperty> "some value" .
```

but sometimes it is missing.
In such a case the [Turtle specification](https://www.w3.org/TR/turtle/#in-html-parsing) requires us to follow section 5.1.1 of the [RFC3986](http://www.ietf.org/rfc/rfc3986.txt) which says that if the base IRI is not encapsulated in the document, it should be assumed to be the document retrieval URI (e.g. the URL you downloaded the document from or a file path converted to a URL). Unfortunately, this cannot be guessed by the hardf parser and has to be provided by you using the `documentIRI` parser creation option, e.g.

```php
$parser = new TriGParser(["documentIRI" => "http://some.base/iri/"]);
```

Long story short: if you run into the `subject/predicate/object on line X can not be parsed without knowing the the document base IRI.(...)` error, please initialize the parser with the `documentIRI` option.

### Utility
```php
use pietercolpaert\hardf\Util;
Expand Down
17 changes: 13 additions & 4 deletions src/TriGParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ private function initReaders()
// Read the subject entity
$this->subject = \call_user_func($this->readEntity, $token);
if (null == $this->subject) {
return;
throw $this->getNoBaseUriException('subject', $token['line']);
}
// In N3 mode, the subject might be a path
if ($this->n3Mode) {
Expand Down Expand Up @@ -398,7 +398,7 @@ private function initReaders()
default:
$this->predicate = \call_user_func($this->readEntity, $token);
if (null == $this->predicate) {
return;
throw $this->getNoBaseUriException('predicate', $token['line']);
}
}
// The next token must be an object
Expand Down Expand Up @@ -437,7 +437,7 @@ private function initReaders()
// Read the object entity
$this->object = \call_user_func($this->readEntity, $token);
if (null == $this->object) {
return;
throw $this->getNoBaseUriException('object', $token['line']);
}
// In N3 mode, the object might be a path
if ($this->n3Mode) {
Expand Down Expand Up @@ -575,7 +575,7 @@ private function initReaders()
default:
$item = \call_user_func($this->readEntity, $token);
if (null == $item) {
return;
throw $this->getNoBaseUriException('list item', $token['line']);
}
}

Expand Down Expand Up @@ -1218,4 +1218,13 @@ public function end()
{
return $this->parseChunk('', true);
}

private function getNoBaseUriException($location, $line)
{
return new \Exception(
"$location on line $line can not be parsed without knowing the the document base IRI.\n".
"Please set the document base IRI using the documentIRI parser configuration option.\n".
"See https://github.com/pietercolpaert/hardf/#empty-document-base-IRI ."
);
}
}
51 changes: 45 additions & 6 deletions test/TriGParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ public function shouldNotParse($createParser, $input, $expectedError = null): vo
$this->fail("Expected this error to be thrown (but it wasn't): ".$expectedError);
}
});
if (false === $errorReceived) {
$this->fail("Expected this error to be thrown (but it wasn't): ".$expectedError);
}
}

/**
Expand Down Expand Up @@ -270,13 +273,19 @@ public function testZeroOrMoreTriples(): void

public function testBlankNodes(): void
{
// should parse diamonds
$this->shouldParse("<> <> <> <>.\n(<>) <> (<>) <>.",
['', '', '', ''],
['_:b0', '', '_:b1', ''],
['_:b0', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first', ''],
// should throw an error on empty list item with lacking document base IRI
$this->shouldNotParse("(<>) <> (<>) <>.",
"list item on line 1 can not be parsed without knowing the the document base IRI.\n".
"Please set the document base IRI using the documentIRI parser configuration option.\n".
"See https://github.com/pietercolpaert/hardf/#empty-document-base-IRI .");

// but should manage if the parser has documentIRI set
$this->shouldParse(function () { return new TriGParser(['documentIRI' => 'http://base/']); },
"(<>) <> (<>) <>.",
['_:b0', 'http://base/', '_:b1', 'http://base/'],
['_:b0', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first', 'http://base/'],
['_:b0', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#rest', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#nil'],
['_:b1', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first', ''],
['_:b1', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first', 'http://base/'],
['_:b1', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#rest', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#nil']);

// should parse statements with named blank nodes
Expand Down Expand Up @@ -2067,4 +2076,34 @@ public function testResolve(): void
$this->itShouldResolve('http://abc/def/ghi?q=xx/yyy/z', 'jjj', 'http://abc/def/jjj');
$this->itShouldResolve('http://abc/def/ghi?q=xx/y?y/z', 'jjj', 'http://abc/def/jjj');
}

// https://github.com/pietercolpaert/hardf/issues/37
public function testIssue37(): void {
// should throw an error on empty subject/predicate/object
$errSuffix = " on line 1 can not be parsed without knowing the the document base IRI.\n".
"Please set the document base IRI using the documentIRI parser configuration option.\n".
"See https://github.com/pietercolpaert/hardf/#empty-document-base-IRI .";
$this->shouldNotParse('<> <b> <c> .', 'subject' . $errSuffix);
$this->shouldNotParse('<a> <> <c> .', 'predicate' . $errSuffix);
$this->shouldNotParse('<a> <b> <> .', 'object' . $errSuffix);

// but should manage with documentIRI being set or @base in the turle
$this->shouldParse(
"@base <http://base/> .\n".
"<> <b> <c> .\n".
"<a> <> <c> .\n".
"<a> <b> <> .",
['http://base/', 'http://base/b', 'http://base/c'],
['http://base/a', 'http://base/', 'http://base/c'],
['http://base/a', 'http://base/b', 'http://base/']);

$parser = function () { return new TriGParser(['documentIRI' => 'http://base/']); };
$this->shouldParse($parser,
"<> <b> <c> .\n".
"<a> <> <c> .\n".
"<a> <b> <> .",
['http://base/', 'http://base/b', 'http://base/c'],
['http://base/a', 'http://base/', 'http://base/c'],
['http://base/a', 'http://base/b', 'http://base/']);
}
}

0 comments on commit 01bf556

Please sign in to comment.