Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Grapheme Clusters 👨‍👩‍👧 (rebased) #49

Merged
merged 4 commits into from
Apr 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Still we hope it will be useful for everybody else.
- [x] Simple
- [x] With groups
- [x] String extensions
- [x] Supports grapheme clusters 👨‍👩‍👧

## Extra

Expand Down
6 changes: 3 additions & 3 deletions Sources/Regex/GroupRangeUtils.swift
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ enum InvalidRangeError : Error {

extension GroupRange {
/// Converts this group range into a range of indices of `source`.
///
/// `location` and `length` are interpreted as UTF-16 code-unit offsets, so
/// all bounds checks and index arithmetic go through `source.utf16` rather
/// than the grapheme-cluster `Character` view. Counting characters would
/// place the range incorrectly for any string containing emoji, combining
/// marks, or other multi-unit clusters.
///
/// - Parameter source: The string the range refers to.
/// - Returns: The half-open range from `location` to `location + length`,
///   expressed as indices of `source`.
/// - Throws: `InvalidRangeError.Error` when `location` is negative, is at or
///   past the UTF-16 length of `source`, or when the range would extend past
///   the end of `source`.
func asRange(ofString source:String) throws -> StringRange {
    let utf16Length = source.utf16.count

    guard location >= 0,
          location < utf16Length,
          location + length <= utf16Length else {
        throw InvalidRangeError.Error
    }

    // Walk the UTF-16 view so offsets agree with the units used for the bounds check.
    let start = source.utf16.index(source.startIndex, offsetBy: location)
    let end = source.utf16.index(start, offsetBy: length)

    return start ..< end
}
Expand Down
6 changes: 3 additions & 3 deletions Sources/Regex/Regex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ public class Regex : RegexProtocol {
*/
public func findAll(in source:String) -> MatchSequence {
let options = NSRegularExpression.MatchingOptions(rawValue: 0)
let range = GroupRange(location: 0, length: source.count)
let range = GroupRange(location: 0, length: source.utf16.count)
let context = compiled?.matches(in: source, options: options, range: range)
//hard unwrap of context, because the instance would not exist without it
return MatchSequence(source: source, context: context!, groupNames: groupNames)
Expand All @@ -247,7 +247,7 @@ public class Regex : RegexProtocol {
*/
public func findFirst(in source:String) -> Match? {
let options = NSRegularExpression.MatchingOptions(rawValue: 0)
let range = GroupRange(location: 0, length: source.count)
let range = GroupRange(location: 0, length: source.utf16.count)
let match = compiled?.firstMatch(in: source, options: options, range: range)
return match.map { match in
Match(source: source, match: match, groupNames: groupNames)
Expand All @@ -264,7 +264,7 @@ public class Regex : RegexProtocol {
*/
public func replaceAll(in source:String, with replacement:String) -> String {
    let options = NSRegularExpression.MatchingOptions(rawValue: 0)
    // The search range is measured in UTF-16 code units, which is how
    // NSRegularExpression addresses the string. Using `source.count`
    // (grapheme clusters) would stop short of the end of any string that
    // contains emoji or other multi-unit characters.
    let range = GroupRange(location: 0, length: source.utf16.count)

    // Hard unwrap: the instance is not expected to exist without a compiled expression.
    return compiled!.stringByReplacingMatches(in: source, options: options, range: range, withTemplate: replacement)
}
Expand Down
64 changes: 63 additions & 1 deletion Tests/RegexTests/RegexTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
//===----------------------------------------------------------------------===//

import XCTest
import Regex
@testable import Regex

class RegexTests: XCTestCase {
static let pattern:String = "(.+?)([1,2,3]*)(.*)"
Expand Down Expand Up @@ -134,6 +134,67 @@ class RegexTests: XCTestCase {
}
XCTAssertEqual("l321321la321a", replaced2)
}

/// Checks that whole matches and capture groups come back intact when the
/// subject contains multi-scalar grapheme clusters (zero-width-joiner emoji
/// sequences, regional-indicator flags) and non-Latin scripts (Arabic, Thai).
func testGraphemeClusters() {
    // The family emoji appears twice: once as a literal ZWJ sequence and once
    // spelled out with explicit \u{200D} joiners. Both forms are the same text.
    let testString = """
    👍👍 Find me. 👌👨‍👩‍👧👨\u{200D}👩\u{200D}👧👌
    """
    let regex = try! Regex(pattern: "^(👍+) *([^👌]+?) *([👌👨‍👩‍👧]+)$",
                           options: [.anchorsMatchLines],
                           groupNames: [])

    // MARK: Anchored regex via findFirst

    guard let firstMatch = regex.findFirst(in: testString) else {
        return XCTFail("Failed to find first match using anchored regex.")
    }

    XCTAssertEqual(firstMatch.group(at: 1), "👍👍",
                   "Incorrect first capture group for anchored regex.")
    XCTAssertEqual(firstMatch.group(at: 2), "Find me.",
                   "Incorrect second capture group for anchored regex.")
    XCTAssertEqual(firstMatch.group(at: 3), "👌👨‍👩‍👧👨‍👩‍👧👌",
                   "Incorrect third capture group for anchored regex.")

    // MARK: Anchored regex via findAll

    let englishMatchSequence = regex.findAll(in: testString)
    XCTAssertEqual(englishMatchSequence.context.count, 1,
                   "Failed to find match using anchored regex.")

    guard let englishFirstMatch = englishMatchSequence.makeIterator().next() else {
        return XCTFail("Match sequence for anchored regex produced no match.")
    }
    XCTAssertEqual(englishFirstMatch.group(at: 1), "👍👍",
                   "Incorrect first capture group for anchored regex.")
    XCTAssertEqual(englishFirstMatch.group(at: 2), "Find me.",
                   "Incorrect second capture group for anchored regex.")
    XCTAssertEqual(englishFirstMatch.group(at: 3), "👌👨‍👩‍👧👨‍👩‍👧👌",
                   "Incorrect third capture group for anchored regex.")

    // MARK: Family emoji (ZWJ sequences)

    let familyEmojiRegex = try! Regex(pattern: "👌([👨‍👩‍👧]+)",
                                      groupNames: [])
    guard let familyFirstMatch = familyEmojiRegex.findFirst(in: testString) else {
        return XCTFail("Failed to find first match using family regex.")
    }
    XCTAssertEqual(familyFirstMatch.matched, "👌👨‍👩‍👧👨‍👩‍👧",
                   "Incorrect matched string for family emoji regex.")
    XCTAssertEqual(familyFirstMatch.group(at: 1), "👨‍👩‍👧👨‍👩‍👧",
                   "Incorrect first capture group for family emoji regex.")

    // MARK: Arabic, preceded by a flag emoji

    let testInArabic = "اختبار"
    let arabicTestString = "🇦🇪 مرحبًا ، هذا اختبار"
    let arabicRegex = try! Regex(pattern: testInArabic, groupNames: [])
    let arabicMatchSequence = arabicRegex.findAll(in: arabicTestString)
    XCTAssertEqual(arabicMatchSequence.context.count, 1, "Failed to find match in Arabic test string.")
    guard let arabicFirstMatch = arabicMatchSequence.makeIterator().next() else {
        return XCTFail("Match sequence for Arabic regex produced no match.")
    }
    XCTAssertEqual(arabicFirstMatch.matched, testInArabic, "Failed to match Arabic.")

    // MARK: Thai, preceded by a flag emoji

    let testInThai = "ทดสอบ"
    let thaiTestString = "🇹🇭 สวัสดีนี่เป็นแบบทดสอบ"
    let thaiRegex = try! Regex(pattern: testInThai, groupNames: [])
    let thaiMatchSequence = thaiRegex.findAll(in: thaiTestString)
    XCTAssertEqual(thaiMatchSequence.context.count, 1, "Failed to find match in Thai test string.")
    guard let thaiFirstMatch = thaiMatchSequence.makeIterator().next() else {
        return XCTFail("Match sequence for Thai regex produced no match.")
    }
    XCTAssertEqual(thaiFirstMatch.matched, testInThai, "Failed to match Thai.")
}

func testSplit() {
let re = namesSplitPattern.r!
Expand Down Expand Up @@ -208,6 +269,7 @@ extension RegexTests {
("testReplaceAllWithReplacer", testReplaceAllWithReplacer),
("testReplaceFirst", testReplaceFirst),
("testReplaceFirstWithReplacer", testReplaceFirstWithReplacer),
("testGraphemeClusters", testGraphemeClusters),
("testSplit", testSplit),
("testSplitOnString", testSplitOnString),
("testSplitWithSubgroups", testSplitWithSubgroups),
Expand Down