From c1f0aea1d61056df8232507f523ed309f4ec6566 Mon Sep 17 00:00:00 2001 From: "Vincent A. Cicirello" Date: Sat, 8 Jun 2024 14:22:16 -0400 Subject: [PATCH] Escape characters that must be escaped in XML (#124) * fixed xml special chars * doc update * Update action.yml * Update CHANGELOG.md --- CHANGELOG.md | 3 ++- README.md | 12 +++++------ action.yml | 2 +- generatesitemap.py | 23 ++++++++++++++++++-- tests/tests.py | 52 +++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 81 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 02e10e75..8116e2da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] - 2024-05-20 +## [Unreleased] - 2024-06-08 ### Added @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed ### Fixed +* Escape characters that must be escaped in XML. 
### CI/CD diff --git a/README.md b/README.md index db0627df..59d89c26 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ sure to include the following as a step in your workflow: ```yml steps: - name: Checkout the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 ``` @@ -242,7 +242,7 @@ you can also use a specific version such as with: ```yml - name: Generate the sitemap - uses: cicirello/generate-sitemap@v1.10.0 + uses: cicirello/generate-sitemap@v1.10.1 with: base-url-path: https://THE.URL.TO.YOUR.PAGE/ ``` @@ -268,7 +268,7 @@ jobs: steps: - name: Checkout the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -306,7 +306,7 @@ jobs: steps: - name: Checkout the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -348,7 +348,7 @@ jobs: steps: - name: Checkout the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -389,7 +389,7 @@ jobs: steps: - name: Checkout the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/action.yml b/action.yml index 5eaa5d83..9bd091f2 100644 --- a/action.yml +++ b/action.yml @@ -1,6 +1,6 @@ # generate-sitemap: Github action for automating sitemap generation # -# Copyright (c) 2020-2023 Vincent A Cicirello +# Copyright (c) 2020-2024 Vincent A Cicirello # https://www.cicirello.org/ # # MIT License diff --git a/generatesitemap.py b/generatesitemap.py index 3150bef2..d6766891 100755 --- a/generatesitemap.py +++ b/generatesitemap.py @@ -2,7 +2,7 @@ # # generate-sitemap: Github action for automating sitemap generation # -# Copyright (c) 2020-2023 Vincent A Cicirello +# Copyright (c) 2020-2024 Vincent A Cicirello # https://www.cicirello.org/ # # MIT License @@ -262,6 +262,25 @@ def removeTime(dateString) : """ return dateString[:10] +def xmlEscapeCharacters(f): + """Escapes any characters that XML requires escaped, such as + ampersands, etc. 
+ + Keyword arguments: + f - the filename + """ + return f.replace( + "&", "&amp;" + ).replace( + "<", "&lt;" + ).replace( + ">", "&gt;" + ).replace( + "'", "&apos;" + ).replace( + '"', "&quot;" + ) + def xmlSitemapEntry(f, baseUrl, dateString, dropExtension=False, dateOnly=False) : """Forms a string with an entry formatted for an xml sitemap including lastmod date. @@ -273,7 +292,7 @@ def xmlSitemapEntry(f, baseUrl, dateString, dropExtension=False, dateOnly=False) dropExtension - true to drop extensions of .html from the filename in urls """ return xmlSitemapEntryTemplate.format( - urlstring(f, baseUrl, dropExtension), + urlstring(xmlEscapeCharacters(f), baseUrl, dropExtension), removeTime(dateString) if dateOnly else dateString ) diff --git a/tests/tests.py b/tests/tests.py index a78dc5dc..4a606d27 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,6 +1,6 @@ # generate-sitemap: Github action for automating sitemap generation # -# Copyright (c) 2020-2023 Vincent A Cicirello +# Copyright (c) 2020-2024 Vincent A Cicirello # https://www.cicirello.org/ # # MIT License @@ -590,6 +590,26 @@ def test_removeTime(self) : date = "2020-09-11T13:35:00-04:00" expected = "2020-09-11" self.assertEqual(expected, gs.removeTime(date)) + + def test_xmlEscapeCharacters(self): + test_strings = [ + "abs&def", + "abs<def", + "abs>def", + "abs'def", + 'abs"def', + """&<>"'"'><&""" + ] + expected = [ + "abs&amp;def", + "abs&lt;def", + "abs&gt;def", + "abs&apos;def", + "abs&quot;def", + "&amp;&lt;&gt;&quot;&apos;&quot;&apos;&gt;&lt;&amp;" + ] + for t, e in zip(test_strings, expected): + self.assertEqual(e, gs.xmlEscapeCharacters(t)) def test_xmlSitemapEntry(self) : base = "https://TESTING.FAKE.WEB.ADDRESS.TESTING/" @@ -613,6 +633,36 @@ def test_xmlSitemapEntryDateOnly(self) : expected = "<url>\n<loc>https://TESTING.FAKE.WEB.ADDRESS.TESTING/a</loc>\n<lastmod>2020-09-11</lastmod>\n</url>" self.assertEqual(actual, expected) + def test_xmlSitemapEntry_withEscapes(self): + base = "https://TESTING.FAKE.WEB.ADDRESS.TESTING/" + f_template = "./a{0}.html" + date = "2020-09-11T13:35:00-04:00" + test_strings = [ + "abs&def",
"absdef", + "abs'def", + 'abs"def', + """&<>"'"'><&""" + ] + expected = [ + "abs&def", + "abs<def", + "abs>def", + "abs'def", + "abs"def", + "&<>"'"'><&" + ] + for t, e in zip(test_strings, expected): + f = f_template.format(t) + self.assertEqual(e, gs.xmlEscapeCharacters(t)) + actual = gs.xmlSitemapEntry(f, base, date) + expected = "\nhttps://TESTING.FAKE.WEB.ADDRESS.TESTING/a{0}.html\n2020-09-11T13:35:00-04:00\n".format(e) + self.assertEqual(actual, expected) + actual = gs.xmlSitemapEntry(f, base, date, True) + expected = "\nhttps://TESTING.FAKE.WEB.ADDRESS.TESTING/a{0}\n2020-09-11T13:35:00-04:00\n".format(e) + self.assertEqual(actual, expected) + def test_robotsTxtParser(self) : expected = [ [], ["/"],