@@ -238,6 +238,33 @@ function convertTimeFormat($time) {
238
238
}
239
239
}
240
240
241
/**
 * Normalize a trailing timezone offset on a date/time value.
 *
 * If $dtValue ends in a numeric UTC offset (e.g. "-8", "+05", "-08:00",
 * "+0530", optionally preceded by "Z"), the offset is rewritten in place
 * to the compact "+HHMM" / "-HHMM" form and that normalized offset is
 * returned. If $dtValue ends in a bare "Z" (any case) or has no trailing
 * offset at all, $dtValue is left untouched and null is returned.
 *
 * NOTE: a date-only value such as "2020-01-01" ends in text that is
 * indistinguishable from an offset ("-01"), so callers must not pass
 * date-only values to this function.
 *
 * @param string $dtValue Date/time string; modified by reference when a
 *                        numeric offset is found.
 * @return string|null The normalized "+HHMM"/"-HHMM" offset, or null when
 *                     there is no numeric offset to normalize.
 */
function normalizeTimezoneOffset(&$dtValue) {
	// Both alternatives are anchored to the end of the string. Leaving "Z"
	// unanchored would let any earlier "Z"/"z" win the leftmost match and
	// hide a numeric offset that follows it (e.g. "10:00Z-08:00").
	if (!preg_match('/(?:Z|([+-])(\d{1,2}):?(\d{2})?)$/i', $dtValue, $matches)) {
		return null;
	}

	// A bare "Z"/"z" matched: PCRE omits the unmatched trailing groups, so
	// there is no sign group. Nothing to normalize.
	if (empty($matches[1])) {
		return null;
	}

	// Hours are left-padded to two digits; minutes default to "00".
	$hours = str_pad($matches[2], 2, '0', STR_PAD_LEFT);
	$minutes = isset($matches[3]) ? $matches[3] : '00';
	$timezoneOffset = $matches[1] . $hours . $minutes;

	// Swap the original offset (and any "Z" directly before it) for the
	// normalized form.
	$dtValue = preg_replace('/Z?[+-]\d{1,2}:?(\d{2})?$/i', $timezoneOffset, $dtValue);

	return $timezoneOffset;
}
267
+
241
268
function applySrcsetUrlTransformation ($ srcset , $ transformation ) {
242
269
return implode (', ' , array_filter (array_map (function ($ srcsetPart ) use ($ transformation ) {
243
270
$ parts = explode (" \t\n\r\0\x0B" , trim ($ srcsetPart ), 2 );
@@ -652,9 +679,10 @@ public function parseU(\DOMElement $u) {
652
679
*
653
680
* @param DOMElement $dt The element to parse
654
681
* @param array $dates Array of dates processed so far
682
+ * @param string $impliedTimezone
655
683
* @return string The datetime string found
656
684
*/
657
- public function parseDT (\DOMElement $ dt , &$ dates = array ()) {
685
+ public function parseDT (\DOMElement $ dt , &$ dates = array (), & $ impliedTimezone = null ) {
658
686
// Check for value-class pattern
659
687
$ valueClassChildren = $ this ->xpath ->query ('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")] ' , $ dt );
660
688
$ dtValue = false ;
@@ -666,73 +694,96 @@ public function parseDT(\DOMElement $dt, &$dates = array()) {
666
694
foreach ($ valueClassChildren as $ e ) {
667
695
if (strstr (' ' . $ e ->getAttribute ('class ' ) . ' ' , ' value-title ' )) {
668
696
$ title = $ e ->getAttribute ('title ' );
669
- if (!empty ($ title ))
697
+ if (!empty ($ title )) {
670
698
$ dateParts [] = $ title ;
699
+ }
671
700
}
672
701
elseif ($ e ->tagName == 'img ' or $ e ->tagName == 'area ' ) {
673
702
// Use @alt
674
703
$ alt = $ e ->getAttribute ('alt ' );
675
- if (!empty ($ alt ))
704
+ if (!empty ($ alt )) {
676
705
$ dateParts [] = $ alt ;
706
+ }
677
707
}
678
708
elseif ($ e ->tagName == 'data ' ) {
679
709
// Use @value, otherwise innertext
680
710
$ value = $ e ->hasAttribute ('value ' ) ? $ e ->getAttribute ('value ' ) : unicodeTrim ($ e ->nodeValue );
681
- if (!empty ($ value ))
711
+ if (!empty ($ value )) {
682
712
$ dateParts [] = $ value ;
713
+ }
683
714
}
684
715
elseif ($ e ->tagName == 'abbr ' ) {
685
716
// Use @title, otherwise innertext
686
717
$ title = $ e ->hasAttribute ('title ' ) ? $ e ->getAttribute ('title ' ) : unicodeTrim ($ e ->nodeValue );
687
- if (!empty ($ title ))
718
+ if (!empty ($ title )) {
688
719
$ dateParts [] = $ title ;
720
+ }
689
721
}
690
722
elseif ($ e ->tagName == 'del ' or $ e ->tagName == 'ins ' or $ e ->tagName == 'time ' ) {
691
723
// Use @datetime if available, otherwise innertext
692
724
$ dtAttr = ($ e ->hasAttribute ('datetime ' )) ? $ e ->getAttribute ('datetime ' ) : unicodeTrim ($ e ->nodeValue );
693
- if (!empty ($ dtAttr ))
725
+ if (!empty ($ dtAttr )) {
694
726
$ dateParts [] = $ dtAttr ;
727
+ }
695
728
}
696
729
else {
697
- if (!empty ($ e ->nodeValue ))
730
+ if (!empty ($ e ->nodeValue )) {
698
731
$ dateParts [] = unicodeTrim ($ e ->nodeValue );
732
+ }
699
733
}
700
734
}
701
735
702
736
// Look through dateParts
703
737
$ datePart = '' ;
704
738
$ timePart = '' ;
739
+ $ timezonePart = '' ;
705
740
foreach ($ dateParts as $ part ) {
706
741
// Is this part a full ISO8601 datetime?
707
- if (preg_match ('/^\d{4}-\d{2}-\d{2}T \d{2}:\d{2}(?:: \d{2})?(?:Z?[+| -]\d{2}:?\d{2})?$/ ' , $ part )) {
742
+ if (preg_match ('/^\d{4}-\d{2}-\d{2}[ T] \d{2}:\d{2}(: \d{2})?(Z|[+ -]\d{2}:?\d{2})?$/ ' , $ part )) {
708
743
// Break completely, we’ve got our value.
709
744
$ dtValue = $ part ;
710
745
break ;
711
746
} else {
712
747
// Is the current part a valid time(+TZ?) AND no other time representation has been found?
713
- if ((preg_match ('/\d{1,2}:\d{1, 2}(Z?[+| -]\d{2}:?\d{2})?/ ' , $ part ) or preg_match ('/\d{1,2}[a|p]m/ ' , $ part )) and empty ($ timePart )) {
748
+ if ((preg_match ('/^ \d{1,2}:\d{2}(:\d{2})?(Z|[+ -]\d{1, 2}:?\d{2})?$ / ' , $ part ) or preg_match ('/^ \d{1,2}(:\d{2})?(:\d{2})?[ap]\.?m\.?$/i ' , $ part )) and empty ($ timePart )) {
714
749
$ timePart = $ part ;
715
- } elseif (preg_match ('/\d{4}-\d{2}-\d{2}/ ' , $ part ) and empty ($ datePart )) {
750
+
751
+ $ timezoneOffset = normalizeTimezoneOffset ($ timePart );
752
+ if (!$ impliedTimezone && $ timezoneOffset ) {
753
+ $ impliedTimezone = $ timezoneOffset ;
754
+ }
755
+ } elseif (preg_match ('/^\d{4}-\d{2}-\d{2}$/ ' , $ part ) and empty ($ datePart )) {
716
756
// Is the current part a valid date AND no other date representation has been found?
717
757
$ datePart = $ part ;
758
+ } elseif (preg_match ('/^(Z|[+-]\d{1,2}:?(\d{2})?)$/ ' , $ part ) and empty ($ timezonePart )) {
759
+ $ timezonePart = $ part ;
760
+
761
+ $ timezoneOffset = normalizeTimezoneOffset ($ timezonePart );
762
+ if (!$ impliedTimezone && $ timezoneOffset ) {
763
+ $ impliedTimezone = $ timezoneOffset ;
764
+ }
718
765
}
719
766
720
767
if ( !empty ($ datePart ) && !in_array ($ datePart , $ dates ) ) {
721
768
$ dates [] = $ datePart ;
722
769
}
723
770
771
+ if (!empty ($ timezonePart ) && !empty ($ timePart )) {
772
+ $ timePart .= $ timezonePart ;
773
+ }
774
+
724
775
$ dtValue = '' ;
725
776
726
777
if ( empty ($ datePart ) && !empty ($ timePart ) ) {
727
778
$ timePart = convertTimeFormat ($ timePart );
728
- $ dtValue = unicodeTrim ($ timePart, ' T ' );
779
+ $ dtValue = unicodeTrim ($ timePart );
729
780
}
730
781
else if ( !empty ($ datePart ) && empty ($ timePart ) ) {
731
782
$ dtValue = rtrim ($ datePart , 'T ' );
732
783
}
733
784
else {
734
785
$ timePart = convertTimeFormat ($ timePart );
735
- $ dtValue = rtrim ($ datePart , 'T ' ) . 'T ' . unicodeTrim ($ timePart, ' T ' );
786
+ $ dtValue = rtrim ($ datePart , 'T ' ) . ' ' . unicodeTrim ($ timePart );
736
787
}
737
788
}
738
789
}
@@ -742,36 +793,54 @@ public function parseDT(\DOMElement $dt, &$dates = array()) {
742
793
// Use @alt
743
794
// Is it an entire dt?
744
795
$ alt = $ dt ->getAttribute ('alt ' );
745
- if (!empty ($ alt ))
796
+ if (!empty ($ alt )) {
746
797
$ dtValue = $ alt ;
798
+ }
747
799
} elseif (in_array ($ dt ->tagName , array ('data ' ))) {
748
800
// Use @value, otherwise innertext
749
801
// Is it an entire dt?
750
802
$ value = $ dt ->getAttribute ('value ' );
751
- if (!empty ($ value ))
803
+ if (!empty ($ value )) {
752
804
$ dtValue = $ value ;
753
- else
805
+ }
806
+ else {
754
807
$ dtValue = $ this ->textContent ($ dt );
808
+ }
755
809
} elseif ($ dt ->tagName == 'abbr ' ) {
756
810
// Use @title, otherwise innertext
757
811
// Is it an entire dt?
758
812
$ title = $ dt ->getAttribute ('title ' );
759
- if (!empty ($ title ))
813
+ if (!empty ($ title )) {
760
814
$ dtValue = $ title ;
761
- else
815
+ }
816
+ else {
762
817
$ dtValue = $ this ->textContent ($ dt );
818
+ }
763
819
} elseif ($ dt ->tagName == 'del ' or $ dt ->tagName == 'ins ' or $ dt ->tagName == 'time ' ) {
764
820
// Use @datetime if available, otherwise innertext
765
821
// Is it an entire dt?
766
822
$ dtAttr = $ dt ->getAttribute ('datetime ' );
767
- if (!empty ($ dtAttr ))
823
+ if (!empty ($ dtAttr )) {
768
824
$ dtValue = $ dtAttr ;
769
- else
825
+ }
826
+ else {
770
827
$ dtValue = $ this ->textContent ($ dt );
828
+ }
829
+
771
830
} else {
772
831
$ dtValue = $ this ->textContent ($ dt );
773
832
}
774
833
834
+ // if the dtValue is not just YYYY-MM-DD, normalize the timezone offset
835
+ if (!preg_match ('/^(\d{4}-\d{2}-\d{2})$/ ' , $ dtValue )) {
836
+ $ timezoneOffset = normalizeTimezoneOffset ($ dtValue );
837
+ if (!$ impliedTimezone && $ timezoneOffset ) {
838
+ $ impliedTimezone = $ timezoneOffset ;
839
+ }
840
+ }
841
+
842
+ $ dtValue = unicodeTrim ($ dtValue );
843
+
775
844
if (preg_match ('/(\d{4}-\d{2}-\d{2})/ ' , $ dtValue , $ matches )) {
776
845
$ dates [] = $ matches [0 ];
777
846
}
@@ -781,9 +850,14 @@ public function parseDT(\DOMElement $dt, &$dates = array()) {
781
850
* if $dtValue is only a time and there are recently parsed dates,
782
851
* form the full date-time using the most recently parsed dt- value
783
852
*/
784
- if ((preg_match ('/^\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/ ' , $ dtValue ) or preg_match ('/^\d{1,2}[a|p]m/ ' , $ dtValue )) && !empty ($ dates )) {
853
+ if ((preg_match ('/^\d{1,2}:\d{2}(:\d{2})?(Z|[+-]\d{2}:?\d{2}?)?$/ ' , $ dtValue ) or preg_match ('/^\d{1,2}(:\d{2})?(:\d{2})?[ap]\.?m\.?$/i ' , $ dtValue )) && !empty ($ dates )) {
854
+ $ timezoneOffset = normalizeTimezoneOffset ($ dtValue );
855
+ if (!$ impliedTimezone && $ timezoneOffset ) {
856
+ $ impliedTimezone = $ timezoneOffset ;
857
+ }
858
+
785
859
$ dtValue = convertTimeFormat ($ dtValue );
786
- $ dtValue = end ($ dates ) . 'T ' . unicodeTrim ($ dtValue, ' T ' );
860
+ $ dtValue = end ($ dates ) . ' ' . unicodeTrim ($ dtValue );
787
861
}
788
862
789
863
return $ dtValue ;
@@ -857,6 +931,7 @@ public function parseH(\DOMElement $e, $is_backcompat = false) {
857
931
$ return = array ();
858
932
$ children = array ();
859
933
$ dates = array ();
934
+ $ impliedTimezone = null ;
860
935
861
936
// each rel-bookmark with an href attribute
862
937
foreach ( $ this ->xpath ->query ('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href] ' , $ e ) as $ el )
@@ -956,25 +1031,37 @@ public function parseH(\DOMElement $e, $is_backcompat = false) {
956
1031
$ this ->elementPrefixParsed ($ u , 'u ' );
957
1032
}
958
1033
1034
+ $ temp_dates = array ();
1035
+
959
1036
// Handle dt-*
960
1037
foreach ($ this ->xpath ->query ('.//*[contains(concat(" ", @class), " dt-")] ' , $ e ) as $ dt ) {
961
1038
if ($ this ->isElementParsed ($ dt , 'dt ' )) {
962
1039
continue ;
963
1040
}
964
1041
965
- $ dtValue = $ this ->parseDT ($ dt , $ dates );
1042
+ $ dtValue = $ this ->parseDT ($ dt , $ dates, $ impliedTimezone );
966
1043
967
1044
if ($ dtValue ) {
968
1045
// Add the value to the array for dt- properties
969
1046
foreach (mfNamesFromElement ($ dt , 'dt- ' ) as $ propName ) {
970
- $ return [$ propName ][] = $ dtValue ;
1047
+ $ temp_dates [$ propName ][] = $ dtValue ;
971
1048
}
972
1049
}
973
-
974
1050
// Make sure this sub-mf won’t get parsed as a top level mf
975
1051
$ this ->elementPrefixParsed ($ dt , 'dt ' );
976
1052
}
977
1053
1054
+ foreach ($ temp_dates as $ propName => $ data ) {
1055
+ foreach ( $ data as $ dtValue ) {
1056
+ // var_dump(preg_match('/[+-]\d{2}(\d{2})?$/i', $dtValue));
1057
+ if ( $ impliedTimezone && preg_match ('/[+-]\d{2}(\d{2})?$/i ' , $ dtValue , $ matches ) == 0 ) {
1058
+ $ dtValue .= $ impliedTimezone ;
1059
+ }
1060
+
1061
+ $ return [$ propName ][] = $ dtValue ;
1062
+ }
1063
+ }
1064
+
978
1065
// Handle e-*
979
1066
foreach ($ this ->xpath ->query ('.//*[contains(concat(" ", @class)," e-")] ' , $ e ) as $ em ) {
980
1067
if ($ this ->isElementParsed ($ em , 'e ' )) {
0 commit comments