Skip to content
Open
1 change: 1 addition & 0 deletions libs/text-splitters/langchain_text_splitters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ class Language(str, Enum):
SCALA = "scala"
SWIFT = "swift"
MARKDOWN = "markdown"
MYSQL = "mysql"
LATEX = "latex"
HTML = "html"
SOL = "sol"
Expand Down
36 changes: 36 additions & 0 deletions libs/text-splitters/langchain_text_splitters/character.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,42 @@ def get_separators_for_language(language: Language) -> list[str]:
" ",
"",
]
if language == Language.MYSQL:
return [
# Split along DDL statements
"\nCREATE TABLE ",
"\nCREATE VIEW ",
"\nCREATE PROCEDURE ",
"\nCREATE FUNCTION ",
"\nCREATE TRIGGER ",
"\nCREATE INDEX ",
"\nALTER TABLE ",
"\nDROP TABLE ",
"\nDROP VIEW ",
"\nTRUNCATE TABLE ",
# Split along DML statements
"\nINSERT INTO ",
"\nUPDATE ",
"\nDELETE ",
# Split along control flow statements
"\nDECLARE ",
"\nBEGIN ",
"\nEND ",
"\nIF ",
"\nCASE ",
"\nLOOP ",
"\nWHILE ",
"\nREPEAT ",
"\nLEAVE ",
"\nITERATE ",
# Split along other statements
"\nDELIMITER ",
# Split by the normal type of lines
"\n\n",
"\n",
" ",
"",
]
if language == Language.MARKDOWN:
return [
# First, try to split along Markdown headings (starting with level 2)
Expand Down
58 changes: 58 additions & 0 deletions libs/text-splitters/tests/unit_tests/test_text_splitters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2262,6 +2262,64 @@ def test_experimental_markdown_syntax_text_splitter_header_config_on_multi_files
assert output == expected_output


def test_mysql_query_text_splitter() -> None:
    """Test splitting a MySQL script with the ``Language.MYSQL`` separators.

    The input script is assembled from the two chunks the splitter is expected
    to return, joined by a blank line.  Building it this way keeps the
    whitespace inside the SQL statements identical on both sides of the
    assertion, so the fixture and the expectation cannot drift apart.
    """
    # First expected chunk: the table definition on its own.
    create_table_chunk = (
        "CREATE TABLE users (\n"
        " id INT AUTO_INCREMENT PRIMARY KEY,\n"
        " username VARCHAR(50) NOT NULL,\n"
        " email VARCHAR(100) NOT NULL\n"
        ");"
    )
    # Second expected chunk: everything from the stored procedure onwards.
    remaining_chunk = (
        "CREATE PROCEDURE GetUser(IN userId INT)\n"
        "BEGIN\n"
        " SELECT * FROM users WHERE id = userId;\n"
        "END;\n\n"
        "INSERT INTO users (username, email) VALUES "
        "('testuser', 'test@example.com');\n\n"
        "DELIMITER //\n"
        "CREATE TRIGGER before_insert_users\n"
        "BEFORE INSERT ON users\n"
        "FOR EACH ROW\n"
        "BEGIN\n"
        " IF NEW.username IS NULL THEN\n"
        " SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'Username cannot be null';\n"
        " END IF;\n"
        "END;\n"
        "//\n"
        "DELIMITER ;"
    )
    # Each chunk is shorter than chunk_size below, but the joined script is
    # longer, so the whole script cannot be returned as a single document.
    mysql_text = f"{create_table_chunk}\n\n{remaining_chunk}"

    expected_docs = [
        Document(
            page_content=create_table_chunk,
            metadata={"source": "source-1"},
        ),
        Document(
            page_content=remaining_chunk,
            metadata={"source": "source-1"},
        ),
    ]

    splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.MYSQL, chunk_size=500, chunk_overlap=0
    )
    docs = splitter.create_documents([mysql_text], [{"source": "source-1"}])

    assert docs == expected_docs


def test_solidity_code_splitter() -> None:
splitter = RecursiveCharacterTextSplitter.from_language(
Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0
Expand Down