chore: remove unused code and class in text splitter (langgenius#4864)

ocxers · Jun 3, 2024 · 5d15aca
1 parent b98a1a3
commit 5d15aca
Showing 1 changed file with 0 additions and 369 deletions.
diff --git a/api/core/splitter/text_splitter.py b/api/core/splitter/text_splitter.py
@@ -6,7 +6,6 @@
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Collection, Iterable, Sequence, Set
 from dataclasses import dataclass
-from enum import Enum
 from typing import (
     Any,
     Literal,
@@ -477,27 +476,6 @@ def _encode(_text: str) -> list[int]:
         return split_text_on_tokens(text=text, tokenizer=tokenizer)
 
 
-class Language(str, Enum):
-    """Enum of the programming languages."""
-
-    CPP = "cpp"
-    GO = "go"
-    JAVA = "java"
-    JS = "js"
-    PHP = "php"
-    PROTO = "proto"
-    PYTHON = "python"
-    RST = "rst"
-    RUBY = "ruby"
-    RUST = "rust"
-    SCALA = "scala"
-    SWIFT = "swift"
-    MARKDOWN = "markdown"
-    LATEX = "latex"
-    HTML = "html"
-    SOL = "sol"
-
-
 class RecursiveCharacterTextSplitter(TextSplitter):
     """Splitting text by recursively look at characters.
 
@@ -554,350 +532,3 @@ def _split_text(self, text: str, separators: list[str]) -> list[str]:
 
     def split_text(self, text: str) -> list[str]:
         return self._split_text(text, self._separators)
-
-    @classmethod
-    def from_language(
-            cls, language: Language, **kwargs: Any
-    ) -> RecursiveCharacterTextSplitter:
-        separators = cls.get_separators_for_language(language)
-        return cls(separators=separators, **kwargs)
-
-    @staticmethod
-    def get_separators_for_language(language: Language) -> list[str]:
-        if language == Language.CPP:
-            return [
-                # Split along class definitions
-                "\nclass ",
-                # Split along function definitions
-                "\nvoid ",
-                "\nint ",
-                "\nfloat ",
-                "\ndouble ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.GO:
-            return [
-                # Split along function definitions
-                "\nfunc ",
-                "\nvar ",
-                "\nconst ",
-                "\ntype ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.JAVA:
-            return [
-                # Split along class definitions
-                "\nclass ",
-                # Split along method definitions
-                "\npublic ",
-                "\nprotected ",
-                "\nprivate ",
-                "\nstatic ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.JS:
-            return [
-                # Split along function definitions
-                "\nfunction ",
-                "\nconst ",
-                "\nlet ",
-                "\nvar ",
-                "\nclass ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nswitch ",
-                "\ncase ",
-                "\ndefault ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.PHP:
-            return [
-                # Split along function definitions
-                "\nfunction ",
-                # Split along class definitions
-                "\nclass ",
-                # Split along control flow statements
-                "\nif ",
-                "\nforeach ",
-                "\nwhile ",
-                "\ndo ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.PROTO:
-            return [
-                # Split along message definitions
-                "\nmessage ",
-                # Split along service definitions
-                "\nservice ",
-                # Split along enum definitions
-                "\nenum ",
-                # Split along option definitions
-                "\noption ",
-                # Split along import statements
-                "\nimport ",
-                # Split along syntax declarations
-                "\nsyntax ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.PYTHON:
-            return [
-                # First, try to split along class definitions
-                "\nclass ",
-                "\ndef ",
-                "\n\tdef ",
-                # Now split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.RST:
-            return [
-                # Split along section titles
-                "\n=+\n",
-                "\n-+\n",
-                "\n\\*+\n",
-                # Split along directive markers
-                "\n\n.. *\n\n",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.RUBY:
-            return [
-                # Split along method definitions
-                "\ndef ",
-                "\nclass ",
-                # Split along control flow statements
-                "\nif ",
-                "\nunless ",
-                "\nwhile ",
-                "\nfor ",
-                "\ndo ",
-                "\nbegin ",
-                "\nrescue ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.RUST:
-            return [
-                # Split along function definitions
-                "\nfn ",
-                "\nconst ",
-                "\nlet ",
-                # Split along control flow statements
-                "\nif ",
-                "\nwhile ",
-                "\nfor ",
-                "\nloop ",
-                "\nmatch ",
-                "\nconst ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.SCALA:
-            return [
-                # Split along class definitions
-                "\nclass ",
-                "\nobject ",
-                # Split along method definitions
-                "\ndef ",
-                "\nval ",
-                "\nvar ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nmatch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.SWIFT:
-            return [
-                # Split along function definitions
-                "\nfunc ",
-                # Split along class definitions
-                "\nclass ",
-                "\nstruct ",
-                "\nenum ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\ndo ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.MARKDOWN:
-            return [
-                # First, try to split along Markdown headings (starting with level 2)
-                "\n#{1,6} ",
-                # Note the alternative syntax for headings (below) is not handled here
-                # Heading level 2
-                # ---------------
-                # End of code block
-                "```\n",
-                # Horizontal lines
-                "\n\\*\\*\\*+\n",
-                "\n---+\n",
-                "\n___+\n",
-                # Note that this splitter doesn't handle horizontal lines defined
-                # by *three or more* of ***, ---, or ___, but this is not handled
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.LATEX:
-            return [
-                # First, try to split along Latex sections
-                "\n\\\\chapter{",
-                "\n\\\\section{",
-                "\n\\\\subsection{",
-                "\n\\\\subsubsection{",
-                # Now split by environments
-                "\n\\\begin{enumerate}",
-                "\n\\\begin{itemize}",
-                "\n\\\begin{description}",
-                "\n\\\begin{list}",
-                "\n\\\begin{quote}",
-                "\n\\\begin{quotation}",
-                "\n\\\begin{verse}",
-                "\n\\\begin{verbatim}",
-                # Now split by math environments
-                "\n\\\begin{align}",
-                "$$",
-                "$",
-                # Now split by the normal type of lines
-                " ",
-                "",
-            ]
-        elif language == Language.HTML:
-            return [
-                # First, try to split along HTML tags
-                "<body",
-                "<div",
-                "<p",
-                "<br",
-                "<li",
-                "<h1",
-                "<h2",
-                "<h3",
-                "<h4",
-                "<h5",
-                "<h6",
-                "<span",
-                "<table",
-                "<tr",
-                "<td",
-                "<th",
-                "<ul",
-                "<ol",
-                "<header",
-                "<footer",
-                "<nav",
-                # Head
-                "<head",
-                "<style",
-                "<script",
-                "<meta",
-                "<title",
-                "",
-            ]
-        elif language == Language.SOL:
-            return [
-                # Split along compiler information definitions
-                "\npragma ",
-                "\nusing ",
-                # Split along contract definitions
-                "\ncontract ",
-                "\ninterface ",
-                "\nlibrary ",
-                # Split along method definitions
-                "\nconstructor ",
-                "\ntype ",
-                "\nfunction ",
-                "\nevent ",
-                "\nmodifier ",
-                "\nerror ",
-                "\nstruct ",
-                "\nenum ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\ndo while ",
-                "\nassembly ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        else:
-            raise ValueError(
-                f"Language {language} is not supported! "
-                f"Please choose from {list(Language)}"
-            )