explosion · JephteyAdolphe · Apr 27, 2025 · Apr 27, 2025 · Apr 28, 2025
diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py
@@ -0,0 +1,52 @@
+from typing import Callable, Optional
+
+from thinc.api import Model
+
+from ...language import BaseDefaults, Language
+from .lemmatizer import HaitianCreoleLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .tag_map import TAG_MAP
+
+
+class HaitianCreoleDefaults(BaseDefaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    prefixes = TOKENIZER_PREFIXES
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
+    tag_map = TAG_MAP
+
+class HaitianCreole(Language):
+    lang = "ht"
+    Defaults = HaitianCreoleDefaults
+
+@HaitianCreole.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
+):
+    return HaitianCreoleLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
+
+__all__ = ["HaitianCreole"]
diff --git a/spacy/lang/ht/examples.py b/spacy/lang/ht/examples.py
@@ -0,0 +1,18 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ht.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
+    "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
+    "San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
+    "Lond se yon gwo vil nan Wayòm Ini",
+    "Kote ou ye?",
+    "Kilès ki prezidan Lafrans?",
+    "Ki kapital Etazini?",
+    "Kile Barack Obama te fèt?",
+]
diff --git a/spacy/lang/ht/lemmatizer.py b/spacy/lang/ht/lemmatizer.py
@@ -0,0 +1,51 @@
+from typing import List, Tuple
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+from ...lookups import Lookups
+
+
+class HaitianCreoleLemmatizer(Lemmatizer):
+    """
+    Minimal Haitian Creole lemmatizer.
+    Returns a word's base form based on rules and lookup,
+    or defaults to the original form.
+    """
+
+    def is_base_form(self, token: Token) -> bool:
+        morph = token.morph.to_dict()
+        upos = token.pos_.lower()
+
+        # Consider unmarked forms to be base
+        if upos in {"noun", "verb", "adj", "adv"}:
+            if not morph:
+                return True
+            if upos == "noun" and morph.get("Number") == "Sing":
+                return True
+            if upos == "verb" and morph.get("VerbForm") == "Inf":
+                return True
+            if upos == "adj" and morph.get("Degree") == "Pos":
+                return True
+        return False
+
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        string = token.text.lower()
+        pos = token.pos_.lower()
+        cache_key = (token.orth, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+
+        forms = []
+
+        # fallback rule: just return lowercased form
+        forms.append(string)
+
+        self.cache[cache_key] = forms
+        return forms
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+        if mode == "rule":
+            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+            return (required, [])
+        return super().get_lookups_config(mode)
diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py
@@ -0,0 +1,78 @@
+from ...attrs import LIKE_NUM, NORM
+
+# Cardinal numbers in Creole
+_num_words = set(
+    """
+zewo youn en de twa kat senk sis sèt uit nèf dis
+onz douz trèz katoz kenz sèz disèt dizwit diznèf
+vent trant karant sinkant swasant swasann-dis
+san mil milyon milya
+""".split()
+)
+
+# Ordinal numbers in Creole (some are French-influenced, some simplified)
+_ordinal_words = set(
+    """
+premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
+onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
+ventyèm trantyèm karantyèm sinkantyèm swasantyèm
+swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
+""".split()
+)
+
+NORM_MAP = {
+    "'m": "mwen",
+    "'w": "ou",
+    "'l": "li",
+    "'n": "nou",
+    "'y": "yo",
+    "’m": "mwen",
+    "’w": "ou",
+    "’l": "li",
+    "’n": "nou",
+    "’y": "yo",
+    "m": "mwen",
+    "n": "nou",
+    "l": "li",
+    "y": "yo",
+    "w": "ou",
+    "t": "te",
+    "k": "ki",
+    "p": "pa",
+    "M": "Mwen",
+    "N": "Nou",
+    "L": "Li",
+    "Y": "Yo",
+    "W": "Ou",
+    "T": "Te",
+    "K": "Ki",
+    "P": "Pa",
+}
+
+def like_num(text):
+    text = text.strip().lower()
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    if text in _ordinal_words:
+        return True
+    # Handle things like "3yèm", "10yèm", "25yèm", etc.
+    if text.endswith("yèm") and text[:-3].isdigit():
+        return True
+    return False
+
+def norm_custom(text):
+    return NORM_MAP.get(text, text.lower())
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num,
+    NORM: norm_custom,
+}
diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py
@@ -0,0 +1,43 @@
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    merge_chars,
+)
+
+ELISION = "'’".replace(" ", "")
+
+_prefixes_elision = "m n l y t k w"
+_prefixes_elision += " " + _prefixes_elision.upper()
+
+TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
+    r"(?:({pe})[{el}])(?=[{a}])".format(
+        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
+    )
+]
+
+TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
+    r"(?<=[0-9])%",  # numbers like 10%
+    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
+    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
+    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
+    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
+    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
+    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
+]
+
+TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
+    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+    ),
+    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+]
diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py
@@ -0,0 +1,50 @@
+STOP_WORDS = set(
+    """
+a ak an ankò ant apre ap atò avan avanlè
+byen bò byenke
+
+chak
+
+de depi deja deja
+
+e en epi èske
+
+fò fòk
+
+gen genyen
+
+ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman
+
+la l laa le lè li lye lò
+
+m m' mwen
+
+nan nap nou n'
+
+ou oumenm
+
+pa paske pami pandan pito pou pral preske pwiske
+
+se selman si sou sòt
+
+ta tap tankou te toujou tou tan tout toutotan twòp tèl
+
+w w' wi wè
+
+y y' yo yon yonn
+
+non o oh eh
+
+sa san si swa si
+
+men mèsi oswa osinon
+
+"""
+.split()
+)
+
+# Add common contractions, with and without apostrophe variants
+contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
+for apostrophe in ["'", "’", "‘"]:
+    for word in contractions:
+        STOP_WORDS.add(word.replace("'", apostrophe))
diff --git a/spacy/lang/ht/syntax_iterators.py b/spacy/lang/ht/syntax_iterators.py
@@ -0,0 +1,74 @@
+from typing import Iterator, Tuple, Union
+
+from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+    """
+    Detect base noun phrases from a dependency parse for Haitian Creole.
+    Works on both Doc and Span objects.
+    """
+
+    # Core nominal dependencies common in Haitian Creole
+    labels = [
+        "nsubj",
+        "obj",
+        "obl",
+        "nmod",
+        "appos",
+        "ROOT",
+    ]
+
+    # Modifiers to optionally include in chunk (to the right)
+    post_modifiers = ["compound", "flat", "flat:name", "fixed"]
+
+    doc = doclike.doc
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+
+    np_deps = {doc.vocab.strings.add(label) for label in labels}
+    np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers}
+    conj_label = doc.vocab.strings.add("conj")
+    np_label = doc.vocab.strings.add("NP")
+    adp_pos = doc.vocab.strings.add("ADP")
+    cc_pos = doc.vocab.strings.add("CCONJ")
+
+    prev_end = -1
+    for i, word in enumerate(doclike):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        if word.left_edge.i <= prev_end:
+            continue
+
+        if word.dep in np_deps:
+            right_end = word
+            # expand to include known modifiers to the right
+            for child in word.rights:
+                if child.dep in np_mods:
+                    right_end = child.right_edge
+                elif child.pos == NOUN:
+                    right_end = child.right_edge
+
+            left_index = word.left_edge.i
+            # Skip prepositions at the start
+            if word.left_edge.pos == adp_pos:
+                left_index += 1
+
+            prev_end = right_end.i
+            yield left_index, right_end.i + 1, np_label
+
+        elif word.dep == conj_label:
+            head = word.head
+            while head.dep == conj_label and head.head.i < head.i:
+                head = head.head
+            if head.dep in np_deps:
+                left_index = word.left_edge.i
+                if word.left_edge.pos == cc_pos:
+                    left_index += 1
+                prev_end = word.i
+                yield left_index, word.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py
@@ -0,0 +1,21 @@
+from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+
+TAG_MAP = {
+    "NOUN": {"pos": NOUN},
+    "VERB": {"pos": VERB},
+    "AUX": {"pos": AUX},
+    "ADJ": {"pos": ADJ},
+    "ADV": {"pos": ADV},
+    "PRON": {"pos": PRON},
+    "DET": {"pos": DET},
+    "ADP": {"pos": ADP},
+    "SCONJ": {"pos": SCONJ},
+    "CCONJ": {"pos": CCONJ},
+    "PART": {"pos": PART},
+    "INTJ": {"pos": INTJ},
+    "NUM": {"pos": NUM},
+    "PROPN": {"pos": PROPN},
+    "PUNCT": {"pos": PUNCT},
+    "SYM": {"pos": SYM},
+    "X": {"pos": X},
+}