Skip to content

Added Haitian Creole (ht) Language Support to spaCy #13807

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions spacy/lang/ht/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from typing import Callable, Optional

from thinc.api import Model

from ...language import BaseDefaults, Language
from .lemmatizer import HaitianCreoleLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP


class HaitianCreoleDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
tag_map = TAG_MAP

class HaitianCreole(Language):
lang = "ht"
Defaults = HaitianCreoleDefaults

@HaitianCreole.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return HaitianCreoleLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)

__all__ = ["HaitianCreole"]
18 changes: 18 additions & 0 deletions spacy/lang/ht/examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.ht.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
"Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
"Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
"San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
"Lond se yon gwo vil nan Wayòm Ini",
"Kote ou ye?",
"Kilès ki prezidan Lafrans?",
"Ki kapital Etazini?",
"Kile Barack Obama te fèt?",
]
51 changes: 51 additions & 0 deletions spacy/lang/ht/lemmatizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from typing import List, Tuple

from ...pipeline import Lemmatizer
from ...tokens import Token
from ...lookups import Lookups


class HaitianCreoleLemmatizer(Lemmatizer):
"""
Minimal Haitian Creole lemmatizer.
Returns a word's base form based on rules and lookup,
or defaults to the original form.
"""

def is_base_form(self, token: Token) -> bool:
morph = token.morph.to_dict()
upos = token.pos_.lower()

# Consider unmarked forms to be base
if upos in {"noun", "verb", "adj", "adv"}:
if not morph:
return True
if upos == "noun" and morph.get("Number") == "Sing":
return True
if upos == "verb" and morph.get("VerbForm") == "Inf":
return True
if upos == "adj" and morph.get("Degree") == "Pos":
return True
return False

def rule_lemmatize(self, token: Token) -> List[str]:
string = token.text.lower()
pos = token.pos_.lower()
cache_key = (token.orth, token.pos)
if cache_key in self.cache:
return self.cache[cache_key]

forms = []

# fallback rule: just return lowercased form
forms.append(string)

self.cache[cache_key] = forms
return forms

@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "rule":
required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
return (required, [])
return super().get_lookups_config(mode)
78 changes: 78 additions & 0 deletions spacy/lang/ht/lex_attrs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from ...attrs import LIKE_NUM, NORM

# Cardinal numbers in Creole
_num_words = set(
"""
zewo youn en de twa kat senk sis sèt uit nèf dis
onz douz trèz katoz kenz sèz disèt dizwit diznèf
vent trant karant sinkant swasant swasann-dis
san mil milyon milya
""".split()
)

# Ordinal numbers in Creole (some are French-influenced, some simplified)
_ordinal_words = set(
"""
premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
ventyèm trantyèm karantyèm sinkantyèm swasantyèm
swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
""".split()
)

NORM_MAP = {
"'m": "mwen",
"'w": "ou",
"'l": "li",
"'n": "nou",
"'y": "yo",
"’m": "mwen",
"’w": "ou",
"’l": "li",
"’n": "nou",
"’y": "yo",
"m": "mwen",
"n": "nou",
"l": "li",
"y": "yo",
"w": "ou",
"t": "te",
"k": "ki",
"p": "pa",
"M": "Mwen",
"N": "Nou",
"L": "Li",
"Y": "Yo",
"W": "Ou",
"T": "Te",
"K": "Ki",
"P": "Pa",
}

def like_num(text):
text = text.strip().lower()
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
if text in _ordinal_words:
return True
# Handle things like "3yèm", "10yèm", "25yèm", etc.
if text.endswith("yèm") and text[:-3].isdigit():
return True
return False

def norm_custom(text):
return NORM_MAP.get(text, text.lower())

LEX_ATTRS = {
LIKE_NUM: like_num,
NORM: norm_custom,
}
43 changes: 43 additions & 0 deletions spacy/lang/ht/punctuation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
HYPHENS,
LIST_PUNCT,
LIST_QUOTES,
LIST_ELLIPSES,
LIST_ICONS,
merge_chars,
)

ELISION = "'’".replace(" ", "")

_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()

TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
r"(?:({pe})[{el}])(?=[{a}])".format(
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
)
]

TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
r"(?<=[0-9])%", # numbers like 10%
r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers
r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters
r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions
r"(?<=[{a}0-9])\)", # right parenthesis after letter/number
r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string
r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis
]

TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
]
50 changes: 50 additions & 0 deletions spacy/lang/ht/stop_words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
STOP_WORDS = set(
"""
a ak an ankò ant apre ap atò avan avanlè
byen bò byenke

chak

de depi deja deja

e en epi èske

fò fòk

gen genyen

ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman

la l laa le lè li lye lò

m m' mwen

nan nap nou n'

ou oumenm

pa paske pami pandan pito pou pral preske pwiske

se selman si sou sòt

ta tap tankou te toujou tou tan tout toutotan twòp tèl

w w' wi wè

y y' yo yon yonn

non o oh eh

sa san si swa si

men mèsi oswa osinon

"""
.split()
)

# Add common contractions, with and without apostrophe variants
contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
for apostrophe in ["'", "’", "‘"]:
for word in contractions:
STOP_WORDS.add(word.replace("'", apostrophe))
74 changes: 74 additions & 0 deletions spacy/lang/ht/syntax_iterators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Iterator, Tuple, Union

from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse for Haitian Creole.
Works on both Doc and Span objects.
"""

# Core nominal dependencies common in Haitian Creole
labels = [
"nsubj",
"obj",
"obl",
"nmod",
"appos",
"ROOT",
]

# Modifiers to optionally include in chunk (to the right)
post_modifiers = ["compound", "flat", "flat:name", "fixed"]

doc = doclike.doc
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)

np_deps = {doc.vocab.strings.add(label) for label in labels}
np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers}
conj_label = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
adp_pos = doc.vocab.strings.add("ADP")
cc_pos = doc.vocab.strings.add("CCONJ")

prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
if word.left_edge.i <= prev_end:
continue

if word.dep in np_deps:
right_end = word
# expand to include known modifiers to the right
for child in word.rights:
if child.dep in np_mods:
right_end = child.right_edge
elif child.pos == NOUN:
right_end = child.right_edge

left_index = word.left_edge.i
# Skip prepositions at the start
if word.left_edge.pos == adp_pos:
left_index += 1

prev_end = right_end.i
yield left_index, right_end.i + 1, np_label

elif word.dep == conj_label:
head = word.head
while head.dep == conj_label and head.head.i < head.i:
head = head.head
if head.dep in np_deps:
left_index = word.left_edge.i
if word.left_edge.pos == cc_pos:
left_index += 1
prev_end = word.i
yield left_index, word.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
21 changes: 21 additions & 0 deletions spacy/lang/ht/tag_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X

TAG_MAP = {
"NOUN": {"pos": NOUN},
"VERB": {"pos": VERB},
"AUX": {"pos": AUX},
"ADJ": {"pos": ADJ},
"ADV": {"pos": ADV},
"PRON": {"pos": PRON},
"DET": {"pos": DET},
"ADP": {"pos": ADP},
"SCONJ": {"pos": SCONJ},
"CCONJ": {"pos": CCONJ},
"PART": {"pos": PART},
"INTJ": {"pos": INTJ},
"NUM": {"pos": NUM},
"PROPN": {"pos": PROPN},
"PUNCT": {"pos": PUNCT},
"SYM": {"pos": SYM},
"X": {"pos": X},
}
Loading