Skip to content

Commit 23c5fa6

Browse files
authored
Merge pull request #131 from gRegorLove/issue115
Improve dt- parsing
2 parents 4266673 + 934091e commit 23c5fa6

File tree

4 files changed

+310
-39
lines changed

4 files changed

+310
-39
lines changed

Mf2/Parser.php

Lines changed: 111 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,33 @@ function convertTimeFormat($time) {
238238
}
239239
}
240240

241+
/**
 * If a date value ends with a numeric timezone offset, normalize it in place
 * to the form +HHMM / -HHMM and return that offset.
 *
 * Accepted trailing offsets are a sign followed by one or two hour digits,
 * an optional colon and optional two minute digits (e.g. "-5", "+05",
 * "+5:30", "-0500", "+05:30"). A "Z" immediately preceding the numeric
 * offset is dropped when the value is rewritten.
 *
 * A trailing "Z" (either case) is recognized as a timezone designator, but
 * the value is left untouched and no offset is returned for it.
 *
 * @param string $dtValue Date/time string; modified by reference when a numeric offset is normalized
 * @return string|null isolated, normalized TZ offset (used as the implied TZ for other dt- properties),
 *                     or null when the value has no trailing numeric offset
 */
function normalizeTimezoneOffset(&$dtValue) {
	// The alternation is grouped so that BOTH "Z" and the numeric form must sit
	// at the very end of the value; an ungrouped "Z|...$" would accept a "Z"
	// anywhere in the string.
	if (!preg_match('/(?:Z|[+-]\d{1,2}:?(?:\d{2})?)$/i', $dtValue, $matches)) {
		return null;
	}

	// The pattern is case-insensitive, so compare case-insensitively as well;
	// otherwise a lowercase "z" would fall through to the numeric branch.
	if (strtoupper($matches[0]) === 'Z') {
		return null;
	}

	$timezoneString = str_replace(':', '', $matches[0]);
	$plus_minus = substr($timezoneString, 0, 1);
	$timezoneOffset = substr($timezoneString, 1);

	// Hours only ("5" or "05"): append zero minutes before padding.
	if (strlen($timezoneOffset) <= 2) {
		$timezoneOffset .= '00';
	}

	// Left-pad single-digit hours ("500" -> "0500") and restore the sign.
	$timezoneOffset = $plus_minus . str_pad($timezoneOffset, 4, '0', STR_PAD_LEFT);

	$dtValue = preg_replace('/Z?[+-]\d{1,2}:?(\d{2})?$/i', $timezoneOffset, $dtValue);

	return $timezoneOffset;
}
267+
241268
function applySrcsetUrlTransformation($srcset, $transformation) {
242269
return implode(', ', array_filter(array_map(function ($srcsetPart) use ($transformation) {
243270
$parts = explode(" \t\n\r\0\x0B", trim($srcsetPart), 2);
@@ -652,9 +679,10 @@ public function parseU(\DOMElement $u) {
652679
*
653680
* @param DOMElement $dt The element to parse
654681
* @param array $dates Array of dates processed so far
682+
* @param string $impliedTimezone
655683
* @return string The datetime string found
656684
*/
657-
public function parseDT(\DOMElement $dt, &$dates = array()) {
685+
public function parseDT(\DOMElement $dt, &$dates = array(), &$impliedTimezone = null) {
658686
// Check for value-class pattern
659687
$valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
660688
$dtValue = false;
@@ -666,73 +694,96 @@ public function parseDT(\DOMElement $dt, &$dates = array()) {
666694
foreach ($valueClassChildren as $e) {
667695
if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
668696
$title = $e->getAttribute('title');
669-
if (!empty($title))
697+
if (!empty($title)) {
670698
$dateParts[] = $title;
699+
}
671700
}
672701
elseif ($e->tagName == 'img' or $e->tagName == 'area') {
673702
// Use @alt
674703
$alt = $e->getAttribute('alt');
675-
if (!empty($alt))
704+
if (!empty($alt)) {
676705
$dateParts[] = $alt;
706+
}
677707
}
678708
elseif ($e->tagName == 'data') {
679709
// Use @value, otherwise innertext
680710
$value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
681-
if (!empty($value))
711+
if (!empty($value)) {
682712
$dateParts[] = $value;
713+
}
683714
}
684715
elseif ($e->tagName == 'abbr') {
685716
// Use @title, otherwise innertext
686717
$title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
687-
if (!empty($title))
718+
if (!empty($title)) {
688719
$dateParts[] = $title;
720+
}
689721
}
690722
elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') {
691723
// Use @datetime if available, otherwise innertext
692724
$dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
693-
if (!empty($dtAttr))
725+
if (!empty($dtAttr)) {
694726
$dateParts[] = $dtAttr;
727+
}
695728
}
696729
else {
697-
if (!empty($e->nodeValue))
730+
if (!empty($e->nodeValue)) {
698731
$dateParts[] = unicodeTrim($e->nodeValue);
732+
}
699733
}
700734
}
701735

702736
// Look through dateParts
703737
$datePart = '';
704738
$timePart = '';
739+
$timezonePart = '';
705740
foreach ($dateParts as $part) {
706741
// Is this part a full ISO8601 datetime?
707-
if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z?[+|-]\d{2}:?\d{2})?$/', $part)) {
742+
if (preg_match('/^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}(:\d{2})?(Z|[+-]\d{2}:?\d{2})?$/', $part)) {
708743
// Break completely, we’ve got our value.
709744
$dtValue = $part;
710745
break;
711746
} else {
712747
// Is the current part a valid time(+TZ?) AND no other time representation has been found?
713-
if ((preg_match('/\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $part) or preg_match('/\d{1,2}[a|p]m/', $part)) and empty($timePart)) {
748+
if ((preg_match('/^\d{1,2}:\d{2}(:\d{2})?(Z|[+-]\d{1,2}:?\d{2})?$/', $part) or preg_match('/^\d{1,2}(:\d{2})?(:\d{2})?[ap]\.?m\.?$/i', $part)) and empty($timePart)) {
714749
$timePart = $part;
715-
} elseif (preg_match('/\d{4}-\d{2}-\d{2}/', $part) and empty($datePart)) {
750+
751+
$timezoneOffset = normalizeTimezoneOffset($timePart);
752+
if (!$impliedTimezone && $timezoneOffset) {
753+
$impliedTimezone = $timezoneOffset;
754+
}
755+
} elseif (preg_match('/^\d{4}-\d{2}-\d{2}$/', $part) and empty($datePart)) {
716756
// Is the current part a valid date AND no other date representation has been found?
717757
$datePart = $part;
758+
} elseif (preg_match('/^(Z|[+-]\d{1,2}:?(\d{2})?)$/', $part) and empty($timezonePart)) {
759+
$timezonePart = $part;
760+
761+
$timezoneOffset = normalizeTimezoneOffset($timezonePart);
762+
if (!$impliedTimezone && $timezoneOffset) {
763+
$impliedTimezone = $timezoneOffset;
764+
}
718765
}
719766

720767
if ( !empty($datePart) && !in_array($datePart, $dates) ) {
721768
$dates[] = $datePart;
722769
}
723770

771+
if (!empty($timezonePart) && !empty($timePart)) {
772+
$timePart .= $timezonePart;
773+
}
774+
724775
$dtValue = '';
725776

726777
if ( empty($datePart) && !empty($timePart) ) {
727778
$timePart = convertTimeFormat($timePart);
728-
$dtValue = unicodeTrim($timePart, 'T');
779+
$dtValue = unicodeTrim($timePart);
729780
}
730781
else if ( !empty($datePart) && empty($timePart) ) {
731782
$dtValue = rtrim($datePart, 'T');
732783
}
733784
else {
734785
$timePart = convertTimeFormat($timePart);
735-
$dtValue = rtrim($datePart, 'T') . 'T' . unicodeTrim($timePart, 'T');
786+
$dtValue = rtrim($datePart, 'T') . ' ' . unicodeTrim($timePart);
736787
}
737788
}
738789
}
@@ -742,36 +793,54 @@ public function parseDT(\DOMElement $dt, &$dates = array()) {
742793
// Use @alt
743794
// Is it an entire dt?
744795
$alt = $dt->getAttribute('alt');
745-
if (!empty($alt))
796+
if (!empty($alt)) {
746797
$dtValue = $alt;
798+
}
747799
} elseif (in_array($dt->tagName, array('data'))) {
748800
// Use @value, otherwise innertext
749801
// Is it an entire dt?
750802
$value = $dt->getAttribute('value');
751-
if (!empty($value))
803+
if (!empty($value)) {
752804
$dtValue = $value;
753-
else
805+
}
806+
else {
754807
$dtValue = $this->textContent($dt);
808+
}
755809
} elseif ($dt->tagName == 'abbr') {
756810
// Use @title, otherwise innertext
757811
// Is it an entire dt?
758812
$title = $dt->getAttribute('title');
759-
if (!empty($title))
813+
if (!empty($title)) {
760814
$dtValue = $title;
761-
else
815+
}
816+
else {
762817
$dtValue = $this->textContent($dt);
818+
}
763819
} elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
764820
// Use @datetime if available, otherwise innertext
765821
// Is it an entire dt?
766822
$dtAttr = $dt->getAttribute('datetime');
767-
if (!empty($dtAttr))
823+
if (!empty($dtAttr)) {
768824
$dtValue = $dtAttr;
769-
else
825+
}
826+
else {
770827
$dtValue = $this->textContent($dt);
828+
}
829+
771830
} else {
772831
$dtValue = $this->textContent($dt);
773832
}
774833

834+
// if the dtValue is not just YYYY-MM-DD, normalize the timezone offset
835+
if (!preg_match('/^(\d{4}-\d{2}-\d{2})$/', $dtValue)) {
836+
$timezoneOffset = normalizeTimezoneOffset($dtValue);
837+
if (!$impliedTimezone && $timezoneOffset) {
838+
$impliedTimezone = $timezoneOffset;
839+
}
840+
}
841+
842+
$dtValue = unicodeTrim($dtValue);
843+
775844
if (preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches)) {
776845
$dates[] = $matches[0];
777846
}
@@ -781,9 +850,14 @@ public function parseDT(\DOMElement $dt, &$dates = array()) {
781850
* if $dtValue is only a time and there are recently parsed dates,
782851
* form the full date-time using the most recently parsed dt- value
783852
*/
784-
if ((preg_match('/^\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $dtValue) or preg_match('/^\d{1,2}[a|p]m/', $dtValue)) && !empty($dates)) {
853+
if ((preg_match('/^\d{1,2}:\d{2}(:\d{2})?(Z|[+-]\d{2}:?\d{2}?)?$/', $dtValue) or preg_match('/^\d{1,2}(:\d{2})?(:\d{2})?[ap]\.?m\.?$/i', $dtValue)) && !empty($dates)) {
854+
$timezoneOffset = normalizeTimezoneOffset($dtValue);
855+
if (!$impliedTimezone && $timezoneOffset) {
856+
$impliedTimezone = $timezoneOffset;
857+
}
858+
785859
$dtValue = convertTimeFormat($dtValue);
786-
$dtValue = end($dates) . 'T' . unicodeTrim($dtValue, 'T');
860+
$dtValue = end($dates) . ' ' . unicodeTrim($dtValue);
787861
}
788862

789863
return $dtValue;
@@ -857,6 +931,7 @@ public function parseH(\DOMElement $e, $is_backcompat = false) {
857931
$return = array();
858932
$children = array();
859933
$dates = array();
934+
$impliedTimezone = null;
860935

861936
// each rel-bookmark with an href attribute
862937
foreach ( $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $e) as $el )
@@ -956,25 +1031,37 @@ public function parseH(\DOMElement $e, $is_backcompat = false) {
9561031
$this->elementPrefixParsed($u, 'u');
9571032
}
9581033

1034+
$temp_dates = array();
1035+
9591036
// Handle dt-*
9601037
foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
9611038
if ($this->isElementParsed($dt, 'dt')) {
9621039
continue;
9631040
}
9641041

965-
$dtValue = $this->parseDT($dt, $dates);
1042+
$dtValue = $this->parseDT($dt, $dates, $impliedTimezone);
9661043

9671044
if ($dtValue) {
9681045
// Add the value to the array for dt- properties
9691046
foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
970-
$return[$propName][] = $dtValue;
1047+
$temp_dates[$propName][] = $dtValue;
9711048
}
9721049
}
973-
9741050
// Make sure this sub-mf won’t get parsed as a top level mf
9751051
$this->elementPrefixParsed($dt, 'dt');
9761052
}
9771053

1054+
foreach ($temp_dates as $propName => $data) {
1055+
foreach ( $data as $dtValue ) {
1056+
// var_dump(preg_match('/[+-]\d{2}(\d{2})?$/i', $dtValue));
1057+
if ( $impliedTimezone && preg_match('/[+-]\d{2}(\d{2})?$/i', $dtValue, $matches) == 0 ) {
1058+
$dtValue .= $impliedTimezone;
1059+
}
1060+
1061+
$return[$propName][] = $dtValue;
1062+
}
1063+
}
1064+
9781065
// Handle e-*
9791066
foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
9801067
if ($this->isElementParsed($em, 'e')) {

tests/Mf2/ClassicMicroformatsTest.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,8 @@ public function test_vevent() {
167167
$this->assertEquals('XYZ Project Review', $output['items'][0]['properties']['name'][0]);
168168
$this->assertEquals('Project XYZ Review Meeting', $output['items'][0]['properties']['description'][0]);
169169
$this->assertEquals('http://example.com/xyz-meeting', $output['items'][0]['properties']['url'][0]);
170-
$this->assertEquals('1998-03-12T08:30', $output['items'][0]['properties']['start'][0]);
171-
$this->assertEquals('1998-03-12T09:30', $output['items'][0]['properties']['end'][0]);
170+
$this->assertEquals('1998-03-12 08:30-0500', $output['items'][0]['properties']['start'][0]);
171+
$this->assertEquals('1998-03-12 09:30-0500', $output['items'][0]['properties']['end'][0]);
172172
}
173173

174174

0 commit comments

Comments
 (0)