Language-agnostic sentence boundary detector using regex patterns.
A universal splitter using Unicode-aware regex patterns for any language.
Handles:
- Unicode sentence terminators
- Numbered lists and headings
- Quoted sentences
- Line breaks and whitespace
Use cases:
- Primary splitter for languages without dedicated support
- Fallback when language-specific splitters are unavailable
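
A minimal usage sketch. The import path and class name `UniversalSplitter` are assumptions inferred from the module path shown below and may differ in the actual package:

```python
# Hypothetical usage sketch; the class name and import path are assumptions
# inferred from src/chunklet/sentence_splitter/_universal_splitter.py.
from chunklet.sentence_splitter._universal_splitter import UniversalSplitter

splitter = UniversalSplitter()
sentences = splitter.split("Bonjour tout le monde. Comment ça va? Très bien!")
# `sentences` is a list of sentence strings; the exact segmentation
# depends on the compiled patterns shown in the source listings below.
print(sentences)
```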
Methods:

- `split` – Splits text into sentences using rule-based regex patterns.
Source code in src/chunklet/sentence_splitter/_universal_splitter.py
```python
def __init__(self):
    self.sentence_terminators = "".join(GLOBAL_SENTENCE_TERMINATORS)
    self.flattened_numbered_list_pattern = re.compile(
        rf"(?<=[{self.sentence_terminators}:])\s+(\p{{N}}\.)+"
    )
    self.quote_or_paren_pattern = re.compile(
        r"(\p{Pi}|['\"]).+?(\p{Pf}|\1)|"
        r"\p{Ps}.+?\p{Pe}",
        re.DOTALL,
    )
    self.hashed_pattern = re.compile(r"##-?\d+##")
    self.numbered_list_pattern = re.compile(r"[\n:]\s*\p{N}\.")
    # Core sentence split regex
    self.sentence_end_pattern = re.compile(
        rf"""
        (?<!\b(\p{{Lu}}\p{{Ll}}{{1, 4}}\.)*)          # Latin-only abbreviation
        (?<=[{self.sentence_terminators}])            # sentence-ending punctuation
        (?=\s+[\p{{Lu}}\p{{Lo}}\p{{Lt}}]|\s*\n|\s*$)  # followed by letter (upper or catch-all) or end
        """,
        re.VERBOSE,
    )
```
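
The `\p{...}` property classes above are not supported by the standard-library `re` module, so the module presumably imports the third-party `regex` package (aliased as `re`). A small sketch of what two of these patterns do, under that assumption; the terminator set below is an illustrative stand-in for `GLOBAL_SENTENCE_TERMINATORS`:

```python
import regex as re  # assumption: the \p{...} classes require the third-party regex module

# Same pattern as self.quote_or_paren_pattern above
quote_or_paren = re.compile(
    r"(\p{Pi}|['\"]).+?(\p{Pf}|\1)|"
    r"\p{Ps}.+?\p{Pe}",
    re.DOTALL,
)
sample = 'He said "Wait. Stop." and left (see Fig. 2).'
print([m.group() for m in quote_or_paren.finditer(sample)])
# ['"Wait. Stop."', '(see Fig. 2)']

# Same shape as self.flattened_numbered_list_pattern, with a stand-in
# terminator set since GLOBAL_SENTENCE_TERMINATORS is defined elsewhere
terminators = ".!?…"
flattened = re.compile(rf"(?<=[{terminators}:])\s+(\p{{N}}\.)+")
print(flattened.sub(r"\n \1", "Steps: 1. Mix the flour. 2. Add water."))
# Steps:
#  1. Mix the flour.
#  2. Add water.
```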
split

`split(text: str) -> list[str]`

Splits text into sentences using rule-based regex patterns.

Parameters:

- `text` (`str`) – The input text to be segmented into sentences.

Returns:

- `list[str]` – A list of sentences after segmentation.
Source code in src/chunklet/sentence_splitter/_universal_splitter.py
```python
def split(self, text: str) -> list[str]:
    """
    Splits text into sentences using rule-based regex patterns.

    Args:
        text: The input text to be segmented into sentences.

    Returns:
        A list of sentences after segmentation.
    """

    def mask(match: re.Match, norm_map: dict):
        # Generate the integer hash and convert it to a string,
        # because re.sub must return a string.
        # Fence it with "##" so it is easy to find again later.
        hashed_str = f"##{hash(match.group())}##"
        # Store the mapping for later reconstruction
        norm_map[hashed_str] = match.group()
        return hashed_str

    def unmask(match: re.Match, norm_map: dict):
        return norm_map.get(match.group(), match.group())

    text = self.flattened_numbered_list_pattern.sub(r"\n \1", text.strip())

    # Mask quoted/parenthesized spans and numbered-list markers to protect them
    norm_map = {}
    text = self.quote_or_paren_pattern.sub(lambda m: mask(m, norm_map), text)
    text = self.numbered_list_pattern.sub(lambda m: mask(m, norm_map), text)

    # First split on sentence-ending punctuation,
    # then split further on newlines
    final_sentences = []
    sentences = self.sentence_end_pattern.split(text.strip())
    for sent in sentences:
        if sent:
            final_sentences.extend(sent.strip().splitlines())

    # Restore the masked spans
    return [
        self.hashed_pattern.sub(lambda m: unmask(m, norm_map), sent)
        for sent in final_sentences
        if sent.strip()
    ]
```
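
For illustration, a standalone sketch of the masking scheme used in `split`: protected spans are swapped out for `##<hash>##` tokens before splitting and swapped back in afterwards. The quote pattern and sample text here are illustrative stand-ins, not taken from the package:

```python
import re

norm_map = {}

def mask(match: re.Match) -> str:
    # Fence the hash with "##" so it can be found again by r"##-?\d+##"
    token = f"##{hash(match.group())}##"
    norm_map[token] = match.group()
    return token

quoted = re.compile(r'"[^"]*"')    # illustrative stand-in for quote_or_paren_pattern
hashed = re.compile(r"##-?\d+##")  # same shape as self.hashed_pattern

masked = quoted.sub(mask, 'She said "No. Never." and walked away.')
# e.g. 'She said ##-1234567890## and walked away.'

restored = hashed.sub(lambda m: norm_map.get(m.group(), m.group()), masked)
assert restored == 'She said "No. Never." and walked away.'
```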