API Reference

v0.8.2

Public API of kham-core — available for Rust, Python (PyO3), WASM (JavaScript / TypeScript), and C FFI.

docs.rs/kham-core ↗ · PyPI: kham ↗ · npm: kham-wasm ↗ · kham.h (C FFI) ↗

Module / Type	Rust path	Description
Tokenizer	`kham_core::Tokenizer`	Core Thai word segmenter — maximal matching over a built-in DAWG dictionary.
Token / TokenKind	`kham_core::Token`	Zero-copy token slice with byte spans, char spans, and script category.
normalizer	`kham_core::normalizer`	Thai text normalization: deduplicate tone marks, compose sara am (อำ).
FtsTokenizer	`kham_core::fts::FtsTokenizer`	Full-text search pipeline: stopwords, synonyms, POS, NE, soundex in one call.
PosTagger	`kham_core::pos::PosTagger`	13-category POS tagger derived from the ORCHID tagset.
NeTagger	`kham_core::ne::NeTagger`	Named entity recognition: Person, Place, Org via built-in gazetteer.
RomanizationMap	`kham_core::romanizer`	RTGS romanization lookup — Thai word → Latin transliteration.
number	`kham_core::number`	Thai numeral utilities: parse Thai digit strings, word-to-number, baht text.
sentence	`kham_core::sentence`	Sentence boundary detection — split a paragraph into sentence spans.
soundex	`kham_core::soundex`	Phonetic codes: lk82, udom83, MetaSound, and cross-language Thai–English.
SpellChecker	`kham_core::spell::SpellChecker`	Spell correction: Levenshtein ≤ 2 candidates ranked by lk82 soundex + TNC frequency.
KeyExtractor	`kham_core::keyword::KeyExtractor`	Keyword extraction: TF × inverse-corpus-frequency, stopwords excluded.

Tokenizer

docs.rs ↗

High-level tokenizer backed by a compressed DAWG dictionary and TNC frequency table. Uses the newmm (maximal matching) algorithm with TCC boundaries. Rust tokens are zero-copy slices of the input string.

use kham_core::Tokenizer;

let tok = Tokenizer::new();

// Simple — list of token strings
let words: Vec<&str> = tok.segment("กินข้าวกับปลา")
    .into_iter().map(|t| t.text).collect();
// ["กิน", "ข้าว", "กับ", "ปลา"]

// Rich — Token structs with span info
let tokens = tok.segment("ธนาคาร100แห่ง");
for t in &tokens {
    println!("{:8} chars={}..{} kind={:?}", t.text,
        t.char_span.start, t.char_span.end, t.kind);
}

// Custom dictionary — merge extra words with built-in
let tok2 = Tokenizer::builder()
    .dict_words("ปัญญาประดิษฐ์\nแมชชีนเลิร์นนิง\n")
    .build();

use kham_core::{Tokenizer, TokenStream};

let tok = Tokenizer::new();

// Streaming — consume one token at a time
let mut stream = tok.segment_stream("ธนาคาร100แห่ง");

// next_word() — skip whitespace
while let Some(t) = stream.next_word() {
    println!("{} ({:?})", t.text, t.kind);
}

// next_above_confidence(0.8) — skip low-confidence tokens
let mut stream = tok.segment_stream("ธนาคาร100แห่ง");
while let Some(t) = stream.next_above_confidence(0.8) {
    println!("{} conf={:.2}", t.text, t.confidence);
}

import kham

# Simple — list of strings
words = kham.segment("กินข้าวกับปลา")
# ['กิน', 'ข้าว', 'กับ', 'ปลา']

# Rich — Token objects (text, byte_start/end, char_start/end, kind)
for t in kham.segment_tokens("ธนาคาร100แห่ง"):
    print(t.text, t.char_start, t.char_end, t.kind)
# ธนาคาร  0  6  Thai
# 100      6  9  Number
# แห่ง     9 13  Thai

# Normalize before segmenting
text = kham.normalize("ข้้าว")   # "ข้าว"  (dedup tone mark)
words = kham.segment(text)

# segment_above_confidence — only tokens with confidence ≥ threshold
for t in kham.segment_above_confidence("ธนาคาร100แห่ง", 0.8):
    print(t.text, t.confidence)

import init, { segment, segment_tokens, normalize, segment_above_confidence } from '/wasm/kham_wasm.js';
await init(); // call once on page load

// Simple — JS Array of strings
const words = segment("กินข้าวกับปลา");
// ["กิน", "ข้าว", "กับ", "ปลา"]

// Rich — Token objects (text, char_start, char_end, byte_start, byte_end, kind)
const tokens = segment_tokens("ธนาคาร100แห่ง");
for (const t of tokens) {
  console.log(t.text, t.char_start, t.char_end, t.kind);
}

// Normalize before segmenting
const clean = normalize("ข้้าว");   // "ข้าว"
const toks  = segment(clean);

// segment_above_confidence — only tokens with confidence ≥ threshold
const high = segment_above_confidence("ธนาคาร100แห่ง", 0.8);
for (const t of high) console.log(t.text, t.confidence);

#include "kham.h"

// Simple — array of token strings (legacy API)
KhamTokens *toks = kham_segment("กินข้าวกับปลา");
for (size_t i = 0; i < toks->len; i++)
    printf("%s\n", toks->words[i]);
kham_tokens_free(toks);

// Rich — KhamToken with byte/char spans and kind
KhamTokenList *list = kham_segment_tokens("ธนาคาร100แห่ง");
for (size_t i = 0; i < list->len; i++) {
    KhamToken t = list->tokens[i];
    printf("%s  chars=%zu..%zu  %s\n",
           t.text, t.char_start, t.char_end, t.kind);
}
kham_token_list_free(list);

Rust key methods: Tokenizer::new(), Tokenizer::builder(), .segment(&str) → Vec<Token>

Token / TokenKind

docs.rs ↗

Every segment() call returns tokens carrying the original text plus two span types: byte offsets (for Rust slicing) and Unicode scalar-value offsets (for Python indexing; in JavaScript, index into [...text], because native string indices count UTF-16 code units).

use kham_core::{TokenKind, NamedEntityKind, Tokenizer};

let tok = Tokenizer::new();
let input = "ธนาคาร100แห่ง";
let tokens = tok.segment(input);

for t in &tokens {
    // t.text       — &str (zero-copy slice of input)
    // t.span       — Range<usize> byte offsets
    // t.char_span  — Range<usize> Unicode scalar-value offsets
    // t.kind       — TokenKind
    // t.confidence — f32: 0.0 (Unknown) … 1.0 (high-confidence dict match)

    assert_eq!(&input[t.span.clone()], t.text);
}

// TokenKind variants:
// Thai | Latin | Number | Punctuation | Emoji | Whitespace | Unknown
// Named(NamedEntityKind::Person | Place | Org)  ← set by NeTagger

import kham

tokens = kham.segment_tokens("ธนาคาร100แห่ง")

for t in tokens:
    # t.text        — str
    # t.byte_start / t.byte_end  — UTF-8 byte offsets
    # t.char_start / t.char_end  — Unicode scalar-value offsets
    # t.kind        — str: "Thai" | "Latin" | "Number" | "Punctuation"
    #                       "Emoji" | "Whitespace" | "Unknown"
    #                       "Person" | "Place" | "Org"  (Named entities)
    # t.confidence  — float: 0.0 (Unknown) … 1.0 (high-confidence dict match)
    print(f"{t.text!r:12} kind={t.kind}  chars={t.char_start}..{t.char_end}  conf={t.confidence:.2f}")

import init, { segment_tokens } from '/wasm/kham_wasm.js';
await init();

const tokens = segment_tokens("ธนาคาร100แห่ง");

for (const t of tokens) {
  // t.text        — string
  // t.byte_start / t.byte_end  — UTF-8 byte offsets
  // t.char_start / t.char_end  — Unicode scalar-value offsets
  // t.kind        — "Thai" | "Latin" | "Number" | "Punctuation"
  //                 "Emoji" | "Whitespace" | "Unknown"
  //                 "Person" | "Place" | "Org"  (Named entities)
  // t.confidence  — number: 0.0 (Unknown) … 1.0 (high-confidence dict match)
  console.log(t.text, t.kind, t.char_start, t.char_end, t.confidence);
}

#include "kham.h"

// KhamToken fields:
//   text        — char* (null-terminated UTF-8)
//   byte_start / byte_end  — size_t byte offsets
//   char_start / char_end  — size_t Unicode scalar-value offsets
//   kind        — char*: "Thai" | "Latin" | "Number" | "Punctuation"
//                        "Emoji" | "Whitespace" | "Unknown"
//                        "Person" | "Place" | "Org"  (FTS pipeline only)
//   confidence  — float: 0.0 (Unknown) … 1.0 (high-confidence dict match)

KhamTokenList *list = kham_segment_tokens("ธนาคาร100แห่ง");
for (size_t i = 0; i < list->len; i++) {
    KhamToken *t = &list->tokens[i];
    printf("%-10s  bytes=%zu..%zu  chars=%zu..%zu  kind=%s\n",
           t->text, t->byte_start, t->byte_end,
           t->char_start, t->char_end, t->kind);
}
kham_token_list_free(list);

normalizer

docs.rs ↗

Two-rule normalization pass: (1) deduplicate consecutive tone marks — keep the last one; (2) compose nikhahit (อํ U+0E4D) + sara aa (อา U+0E32) into sara am (อำ U+0E33). Call before segmenting when input may come from user keyboards or OCR.

use kham_core::normalizer::normalize;

// Rule 1 — deduplicate tone marks (keep last)
assert_eq!(normalize("ข้้าว"), "ข้าว");   // doubled mai tho → single
assert_eq!(normalize("ก่้"),   "ก้");      // mai ek + mai tho → mai tho

// Rule 2 — sara am composition
// nikhahit (U+0E4D) + sara aa (U+0E32) → sara am (U+0E33)
let decomposed = "\u{0E01}\u{0E4D}\u{0E32}"; // กํา (vowel spelled as two codepoints: U+0E4D + U+0E32)
assert_eq!(normalize(decomposed), "กำ");          // กำ  (vowel is one codepoint: U+0E33)

// Already canonical — returned unchanged (no allocation)
assert_eq!(normalize("กินข้าว"), "กินข้าว");

import kham

# Rule 1 — deduplicate tone marks
kham.normalize("ข้้าว")   # → "ข้าว"

# Rule 2 — sara am composition
# nikhahit (U+0E4D) + sara aa (U+0E32) → sara am (U+0E33)
kham.normalize("\u0e01\u0e4d\u0e32")  # กํา → กำ

# Already canonical — returned unchanged
kham.normalize("กินข้าว")  # → "กินข้าว"

# Typical usage: normalize before segmenting
words = kham.segment(kham.normalize(raw_input))

import init, { normalize, segment } from '/wasm/kham_wasm.js';
await init();

// Rule 1 — deduplicate tone marks
normalize("ข้้าว");   // → "ข้าว"

// Rule 2 — sara am composition
// nikhahit (U+0E4D) + sara aa (U+0E32) → sara am (U+0E33)
normalize("\u0E01\u0E4D\u0E32");  // กํา → กำ

// Already canonical — returned unchanged
normalize("กินข้าว");  // → "กินข้าว"

// Typical usage: normalize before segmenting
const words = segment(normalize(rawInput));

#include "kham.h"

// kham_normalize returns a heap-allocated string — free with kham_string_free

char *out = kham_normalize("ข้้าว");   // "ข้าว"  (dedup tone mark)
printf("%s\n", out);
kham_string_free(out);

// Sara am composition: nikhahit + sara aa → sara am
char *out2 = kham_normalize("\xe0\xb8\x81\xe0\xb9\x8d\xe0\xb8\xb2"); // กํา
printf("%s\n", out2);   // กำ
kham_string_free(out2);

// Typical pattern: normalize then segment
char *norm = kham_normalize(raw_input);
KhamTokens *toks = kham_segment(norm);
kham_string_free(norm);
// ... use toks ...
kham_tokens_free(toks);

FtsTokenizer

docs.rs ↗

Full NLP pipeline in a single pass: normalize → segment → NE → stopwords → POS → synonyms → romanization. In Python and WASM this is the primary way to access POS and NE metadata.

use kham_core::fts::FtsTokenizer;
use kham_core::soundex::SoundexAlgorithm;
use kham_core::synonym::SynonymMap;

// Default pipeline
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("นายกรัฐมนตรีกินข้าว");
for t in &tokens {
    println!("{:8} pos={:?} ne={:?} stop={}", t.text, t.pos, t.ne, t.is_stop);
}

// index_tokens: preserve positions, filter stopwords for phrase search
let indexed = fts.index_tokens("กินข้าวกับปลา");

// lexemes: flat Vec<String> of text + synonyms + trigrams (for tsvector)
let lexemes = fts.lexemes("กินข้าวกับปลา");

// Custom pipeline
let fts2 = FtsTokenizer::builder()
    .synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n"))
    .soundex(SoundexAlgorithm::Lk82)
    .build();

import kham

# FtsToken fields: text, position, kind, is_stop, roman,
#                  pos, ne, synonyms (list), trigrams (list), confidence (float)
for t in kham.segment_fts("นายกรัฐมนตรีกินข้าว"):
    print(f"{t.text:10} pos={t.pos!r:15} ne={t.ne!r} stop={t.is_stop}")

# POS tags: "Noun" | "Verb" | "Adj" | "Adv" | "Particle" | "ProperNoun"
#           "Pronoun" | "Numeral" | "Classifier" | "Conjunction"
#           "Auxiliary" | "Determiner" | "Preposition"  (None if OOV)
# NE tags:  "Person" | "Place" | "Org"  (None if not in gazetteer)

# Romanization is included in every FtsToken
for t in kham.segment_fts("กินข้าว"):
    print(t.text, "→", t.roman)   # กิน → kin

import init, { segment_fts } from '/wasm/kham_wasm.js';
await init();

// FtsToken properties: text, position, kind, is_stop, roman,
//                      pos (string|null), ne (string|null),
//                      synonyms (string[]), trigrams (string[]),
//                      confidence (number: 0.0…1.0)
const tokens = segment_fts("นายกรัฐมนตรีกินข้าว");
for (const t of tokens) {
  console.log(t.text, '| pos:', t.pos, '| ne:', t.ne,
              '| stop:', t.is_stop, '| roman:', t.roman);
}

// POS tags: "Noun" | "Verb" | "Adj" | "Adv" | "Particle" | "ProperNoun"
//           "Pronoun" | "Numeral" | "Classifier" | "Conjunction"
//           "Auxiliary" | "Determiner" | "Preposition"  (null if OOV)
// NE tags:  "Person" | "Place" | "Org"  (null if not in gazetteer)

#include "kham.h"

// KhamFtsToken fields:
//   text, position, kind, is_stop, roman (char*, never NULL)
//   pos (char* or NULL if OOV), ne (char* or NULL)
//   synonyms / synonyms_len, trigrams / trigrams_len

KhamFtsTokenList *list = kham_fts_segment("นายกรัฐมนตรีกินข้าว");
for (size_t i = 0; i < list->len; i++) {
    KhamFtsToken *t = &list->tokens[i];
    printf("%-10s  pos=%-14s  ne=%-8s  stop=%d  roman=%s\n",
           t->text,
           t->pos  ? t->pos  : "(none)",
           t->ne   ? t->ne   : "(none)",
           t->is_stop, t->roman);
}
kham_fts_token_list_free(list);

// Flat lexeme array (for PostgreSQL tsvector / SQLite FTS5)
size_t len;
char **lex = kham_fts_lexemes("กินข้าวกับปลา", &len);
for (size_t i = 0; i < len; i++) printf("%s\n", lex[i]);
kham_fts_lexemes_free(lex, len);

FtsToken fields: text, position, kind, is_stop, roman, pos, ne, synonyms, trigrams

PosTagger

docs.rs ↗

Dictionary-lookup POS tagger using a 13-category tagset derived from the ORCHID corpus. In Python, WASM, and C, POS tags are accessed via segment_fts() / kham_fts_segment() — direct tagger construction is Rust-only.

Tag	Category	Examples
NOUN	Noun	คน บ้าน ปลา
VERB	Verb	กิน ทำ ไป
ADJ	Adjective	ดี ใหญ่ สวย
ADV	Adverb	มาก เร็ว เสมอ
PART	Particle	ครับ ค่ะ นะ
PROPN	Proper noun	กรุงเทพ ไทย
PRON	Pronoun	ฉัน เขา เรา
NUM	Numeral	หนึ่ง สิบ ร้อย
CLAS	Classifier	ตัว ใบ อัน
CONJ	Conjunction	และ หรือ แต่
AUX	Auxiliary	ได้ ต้อง กำลัง
DET	Determiner	นี้ นั้น ทุก
PREP	Preposition	ใน บน ตาม

use kham_core::pos::PosTagger;

let tagger = PosTagger::builtin();

// Tag a single word
if let Some(pos) = tagger.tag("กิน") {
    println!("{:?}", pos); // Verb
}

// Custom tagger from TSV: word<TAB>POS_TAG
let custom = PosTagger::from_tsv("GPT\tNOUN\nแชทบอท\tNOUN\n");
assert_eq!(custom.tag("แชทบอท"), Some(kham_core::pos::PosTag::Noun));

import kham

# POS tagging is available through segment_fts()
# t.pos returns a string tag or None for OOV / non-Thai tokens

for t in kham.segment_fts("นักเรียนกินข้าวกับปลา"):
    if t.pos:
        print(f"{t.text:8} → {t.pos}")
# นักเรียน → Noun
# กิน      → Verb
# ข้าว     → Noun
# กับ      → Preposition
# ปลา      → Noun

import init, { segment_fts } from '/wasm/kham_wasm.js';
await init();

// POS tagging is available through segment_fts()
// t.pos returns a string tag or null for OOV / non-Thai tokens

const tokens = segment_fts("นักเรียนกินข้าวกับปลา");
for (const t of tokens) {
  if (t.pos) console.log(t.text, '→', t.pos);
}
// นักเรียน → Noun
// กิน      → Verb
// ข้าว     → Noun
// กับ      → Preposition
// ปลา      → Noun

#include "kham.h"

// POS tagging is available through kham_fts_segment()
// t->pos is NULL for OOV / non-Thai tokens

KhamFtsTokenList *list = kham_fts_segment("นักเรียนกินข้าวกับปลา");
for (size_t i = 0; i < list->len; i++) {
    KhamFtsToken *t = &list->tokens[i];
    if (t->pos)
        printf("%-10s → %s\n", t->text, t->pos);
}
// นักเรียน → Noun
// กิน      → Verb
// ข้าว     → Noun
// กับ      → Preposition
// ปลา      → Noun
kham_fts_token_list_free(list);

NeTagger

docs.rs ↗

Gazetteer-based NER with three categories: Person, Place, Org. In Python, WASM, and C, NE tags are accessed via segment_fts() / kham_fts_segment().

use kham_core::ne::NeTagger;
use kham_core::{TokenKind, Tokenizer};

let ne = NeTagger::builtin();
println!("{:?}", ne.tag("กรุงเทพ")); // Some(Place)

// Post-process tokens from Tokenizer::segment
let tok = Tokenizer::new();
let src = "บริษัทไทยออยล์ก่อตั้งในกรุงเทพ";
let tokens = ne.tag_tokens(tok.segment(src), src);

for t in &tokens {
    if matches!(t.kind, TokenKind::Named(_)) {
        println!("{} → {:?}", t.text, t.kind);
    }
}

// Custom gazetteer from TSV: word<TAB>NE_TAG  (PERSON | PLACE | ORG)
let custom = NeTagger::from_tsv("แอนโทรปิก\tORG\n");

import kham

# NE tagging is available through segment_fts()
# t.ne returns "Person" | "Place" | "Org" or None

for t in kham.segment_fts("บริษัทไทยออยล์ก่อตั้งในกรุงเทพ"):
    if t.ne:
        print(f"{t.text:12} → {t.ne}")
# ไทยออยล์  → Org
# กรุงเทพ   → Place

import init, { segment_fts } from '/wasm/kham_wasm.js';
await init();

// NE tagging is available through segment_fts()
// t.ne returns "Person" | "Place" | "Org" or null

const tokens = segment_fts("บริษัทไทยออยล์ก่อตั้งในกรุงเทพ");
for (const t of tokens) {
  if (t.ne) console.log(t.text, '→', t.ne);
}
// ไทยออยล์  → Org
// กรุงเทพ   → Place

#include "kham.h"

// NE tagging is available through kham_fts_segment()
// t->ne is NULL if not in the NE gazetteer

KhamFtsTokenList *list = kham_fts_segment("บริษัทไทยออยล์ก่อตั้งในกรุงเทพ");
for (size_t i = 0; i < list->len; i++) {
    KhamFtsToken *t = &list->tokens[i];
    if (t->ne)
        printf("%-12s → %s\n", t->text, t->ne);
}
// ไทยออยล์  → Org
// กรุงเทพ   → Place
kham_fts_token_list_free(list);

RomanizationMap

docs.rs ↗

RTGS (Royal Thai General System) table-lookup romanization. Falls back to the original Thai text for out-of-vocabulary words.

use kham_core::romanizer::RomanizationMap;

let rom = RomanizationMap::builtin();

// Single word lookup
println!("{:?}", rom.romanize("กรุงเทพ"));   // Some("Krung Thep")
println!("{}", rom.romanize_or_raw("ปลา"));   // "pla"
println!("{}", rom.romanize_or_raw("zzz"));   // "zzz"  (OOV → passthrough)

// Batch lookup
let roman = rom.romanize_tokens(&["กรุงเทพ", "ประเทศ", "ไทย"]);
println!("{:?}", roman); // ["Krung Thep", "prathet", "Thai"]

// Whole sentence romanization
let sentence = rom.romanize_sentence("กินข้าวกับปลา 100 บาท");
println!("{sentence}"); // kinkhaokapla 100 bat

import kham

# romanize() returns list of RomanToken(text, roman)
for t in kham.romanize("กินข้าวกับปลา"):
    print(f"{t.text:6} → {t.roman}")
# กิน   → kin
# ข้าว  → khao
# กับ   → kap
# ปลา   → pla

# roman is also available on every FtsToken
for t in kham.segment_fts("กรุงเทพ"):
    print(t.text, t.roman)   # กรุงเทพ  Krung Thep

import init, { romanize } from '/wasm/kham_wasm.js';
await init();

// romanize() returns array of RomanToken objects (text, roman)
const pairs = romanize("กินข้าวกับปลา");
for (const t of pairs) {
  console.log(t.text, '→', t.roman);
}
// กิน   → kin
// ข้าว  → khao
// กับ   → kap
// ปลา   → pla

// roman is also available on every FtsToken from segment_fts()

#include "kham.h"

// kham_romanize returns KhamRomanTokenList
// Each KhamRomanToken has: text (char*), roman (char*)

KhamRomanTokenList *list = kham_romanize("กินข้าวกับปลา");
for (size_t i = 0; i < list->len; i++) {
    KhamRomanToken *t = &list->tokens[i];
    printf("%-6s → %s\n", t->text, t->roman);
}
// กิน   → kin
// ข้าว  → khao
// กับ   → kap
// ปลา   → pla
kham_roman_token_list_free(list);

// roman is also on every KhamFtsToken from kham_fts_segment()

number

docs.rs ↗

Thai numeral utilities: convert Thai digit characters (๐–๙) to ASCII, parse/generate Thai cardinal number words, and render Thai Baht currency text.

use kham_core::number::{
    thai_digits_to_ascii, parse_thai_word, u64_to_thai_word,
    parse_thai_baht, to_thai_baht_text,
};

// Thai digits → ASCII
assert_eq!(thai_digits_to_ascii("ราคา ๑๒๓ บาท"), "ราคา 123 บาท");

// Number word parsing
assert_eq!(parse_thai_word("หนึ่งร้อยยี่สิบสาม"), Some(123));
assert_eq!(parse_thai_word("สองล้าน"),             Some(2_000_000));
assert_eq!(parse_thai_word("กินข้าว"),             None); // not a number

// Number → Thai word
println!("{}", u64_to_thai_word(42));        // "สี่สิบสอง"
println!("{}", u64_to_thai_word(1_000_000)); // "หนึ่งล้าน"

// Baht text
println!("{}", to_thai_baht_text(1234, 50));
// "หนึ่งพันสองร้อยสามสิบสี่บาทห้าสิบสตางค์"
if let Some(amt) = parse_thai_baht("หนึ่งร้อยบาทถ้วน") {
    println!("{} baht {} satang", amt.baht, amt.satang); // 100 0
}

import kham

# Thai digits → ASCII
kham.thai_digits_to_ascii("ราคา ๑๒๓ บาท")   # "ราคา 123 บาท"

# Thai word → number (returns int or None)
kham.thai_word_to_number("หนึ่งร้อยยี่สิบสาม")  # 123
kham.thai_word_to_number("สองล้าน")              # 2000000
kham.thai_word_to_number("กินข้าว")              # None

# Number → Thai word (accepts full u64 range)
kham.number_to_thai_word(42)          # "สี่สิบสอง"
kham.number_to_thai_word(1_000_000)   # "หนึ่งล้าน"

# Baht text
kham.number_to_baht_text(1234, 50)
# "หนึ่งพันสองร้อยสามสิบสี่บาทห้าสิบสตางค์"

amt = kham.parse_baht_text("หนึ่งร้อยบาทถ้วน")  # BahtAmount or None
if amt:
    print(amt.baht, amt.satang)   # 100  0

import init, {
  thai_digits_to_ascii, thai_word_to_number, number_to_thai_word,
  number_to_baht_text, parse_baht_text,
} from '/wasm/kham_wasm.js';
await init();

// Thai digits → ASCII
thai_digits_to_ascii("ราคา ๑๒๓ บาท");   // "ราคา 123 บาท"

// Thai word → number (returns decimal string; "" if not a number)
thai_word_to_number("หนึ่งร้อยยี่สิบสาม");  // "123"
thai_word_to_number("กินข้าว");              // ""  (not a number)

// Number → Thai word (u64 as BigInt — up to 9,007,199,254,740,991 safely)
number_to_thai_word(42n);           // "สี่สิบสอง"
number_to_thai_word(10_000_000_000n); // "หนึ่งหมื่นล้าน"

// Baht text (baht is BigInt)
number_to_baht_text(1234n, 50);
// "หนึ่งพันสองร้อยสามสิบสี่บาทห้าสิบสตางค์"

const r = parse_baht_text("หนึ่งร้อยบาทถ้วน");
if (r.valid) console.log(r.baht, r.satang);  // 100n  0

#include "kham.h"

// Thai digits → ASCII (free with kham_string_free)
char *s = kham_thai_digits_to_ascii("ราคา ๑๒๓ บาท");
printf("%s\n", s);   // "ราคา 123 บาท"
kham_string_free(s);

// Thai word → number
uint64_t n;
if (kham_thai_word_to_number("หนึ่งร้อยยี่สิบสาม", &n))
    printf("%llu\n", (unsigned long long)n);  // 123

// Number → Thai word (free with kham_string_free)
char *word = kham_number_to_thai_word(1000000);
printf("%s\n", word);   // "หนึ่งล้าน"
kham_string_free(word);

// Baht text (free with kham_string_free)
char *baht = kham_number_to_baht_text(1234, 50);
printf("%s\n", baht);
kham_string_free(baht);

// Parse baht (free with kham_baht_amount_free)
KhamBahtAmount *amt = kham_parse_baht_text("หนึ่งร้อยบาทถ้วน");
if (amt) {
    printf("%llu baht %u satang\n",
           (unsigned long long)amt->baht, amt->satang);
    kham_baht_amount_free(amt);
}

sentence

docs.rs ↗

Sentence boundary detection. Splits on newlines, Thai markers (ฯ ๚ ๛), and Western punctuation (! ? . followed by space). Each sentence span carries char offsets for Python/JS string slicing.

use kham_core::sentence::split_sentences;

let text = "คุณชอบอาหารไทยไหม? ผมชอบต้มยำกุ้ง!\nอาหารไทยรสเผ็ด";
let sents = split_sentences(text);

for (i, s) in sents.iter().enumerate() {
    println!("S{i}: {:?}  chars={}..{}", s.text, s.char_span.start, s.char_span.end);
}
// S0: "คุณชอบอาหารไทยไหม?"     chars=0..18
// S1: " ผมชอบต้มยำกุ้ง!"       chars=18..34
// S2: "\nอาหารไทยรสเผ็ด"       chars=34..49

import kham

# split_sentences() returns list of Sentence(text, char_start, char_end)
text = "คุณชอบอาหารไทยไหม? ผมชอบต้มยำกุ้ง!\nอาหารไทยรสเผ็ด"
for i, s in enumerate(kham.split_sentences(text)):
    print(f"S{i}: {s.text!r}  chars={s.char_start}..{s.char_end}")

# Reconstruct original text from spans:
reconstructed = "".join(s.text for s in kham.split_sentences(text))
assert reconstructed == text

import init, { split_sentences } from '/wasm/kham_wasm.js';
await init();

// split_sentences() returns array of Sentence objects (text, char_start, char_end)
const text = "คุณชอบอาหารไทยไหม? ผมชอบต้มยำกุ้ง!\nอาหารไทยรสเผ็ด";
const sents = split_sentences(text);

for (const s of sents) {
  console.log(s.text, s.char_start, s.char_end);
}

// Slice original string using char offsets
// (use [...text] spread to get Unicode scalar values)
const chars = [...text];
for (const s of sents) {
  console.log(chars.slice(s.char_start, s.char_end).join(''));
}

#include "kham.h"

// KhamSentence fields: text (char*), char_start, char_end (size_t)

const char *text = "คุณชอบอาหารไทยไหม? ผมชอบต้มยำกุ้ง!\nอาหารไทยรสเผ็ด";
KhamSentenceList *list = kham_split_sentences(text);

for (size_t i = 0; i < list->len; i++) {
    KhamSentence *s = &list->sentences[i];
    printf("S%zu: %s  (chars %zu..%zu)\n",
           i, s->text, s->char_start, s->char_end);
}
// S0: คุณชอบอาหารไทยไหม?  (chars 0..18)
// S1:  ผมชอบต้มยำกุ้ง!   (chars 18..34)
// S2: <newline>อาหารไทยรสเผ็ด  (chars 34..49)
kham_sentence_list_free(list);

soundex

docs.rs ↗

Thai phonetic encoding: lk82 (12 groups, 4-char), udom83 (14 groups, 4-char), MetaSound (3 chars/syllable). Plus a Thai–English cross-language algorithm (Suwanvisat & Prasitjutrakul 1998) for transliterated name search.

use kham_core::soundex::{
    soundex, sounds_like, SoundexAlgorithm,
    thai_english_soundex, sounds_like_cross_lang,
};

// Thai soundex
println!("{}", soundex("กาน", SoundexAlgorithm::Lk82));      // "1600"
println!("{}", soundex("กาน", SoundexAlgorithm::Udom83));    // "1900"
println!("{}", soundex("กาน", SoundexAlgorithm::MetaSound)); // "112"

// Similarity check
assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::Lk82));  // same group
assert!(!sounds_like("ลาน", "ราน", SoundexAlgorithm::Udom83)); // ล/ร split

// Thai–English cross-language
println!("{}", thai_english_soundex("Somchai")); // same as thai_english_soundex("สมชาย")
assert!(sounds_like_cross_lang("สมชาย", "Somchai")); // true

import kham

# Thai soundex  (algo defaults to "lk82")
kham.soundex_word("กาน")              # "1600"  (lk82)
kham.soundex_word("กาน", "udom83")    # "1900"
kham.soundex_word("กาน", "metasound") # "112"

# Similarity check
kham.sounds_like("กาน", "ขาน")            # True  (same lk82 group)
kham.sounds_like("ลาน", "ราน", "udom83")  # False (ล/ร split in udom83)

# Thai–English cross-language
kham.thai_english_soundex("Somchai")       # same code as "สมชาย"
kham.sounds_like_cross_lang("สมชาย", "Somchai")   # True
kham.sounds_like_cross_lang("Robert",  "Rupert")   # True

import init, {
  soundex_word, sounds_like, thai_english_soundex, sounds_like_cross_lang,
} from '/wasm/kham_wasm.js';
await init();

// Thai soundex  (algo defaults to "lk82")
soundex_word("กาน");               // "1600"  (lk82)
soundex_word("กาน", "udom83");     // "1900"
soundex_word("กาน", "metasound");  // "112"

// Similarity check
sounds_like("กาน", "ขาน");             // true  (same lk82 group)
sounds_like("ลาน", "ราน", "udom83");   // false (ล/ร split in udom83)

// Thai–English cross-language
thai_english_soundex("Somchai");        // same code as "สมชาย"
sounds_like_cross_lang("สมชาย", "Somchai");   // true
sounds_like_cross_lang("Robert",  "Rupert");   // true

#include "kham.h"

// kham_soundex — algo: "lk82" (default/NULL), "udom83", "metasound"
// All string results freed with kham_string_free

char *code = kham_soundex("กาน", NULL);       // "1600"  (lk82)
kham_string_free(code);
code = kham_soundex("กาน", "udom83");          // "1900"
kham_string_free(code);
code = kham_soundex("กาน", "metasound");       // "112"
kham_string_free(code);

// Similarity check (no allocation)
bool alike = kham_sounds_like("กาน", "ขาน", NULL);     // true  (lk82)
bool split = kham_sounds_like("ลาน", "ราน", "udom83"); // false (ล/ร split)

// Thai–English cross-language
char *eng = kham_thai_english_soundex("Somchai");
char *tha = kham_thai_english_soundex("สมชาย");
printf("match=%d\n", strcmp(eng, tha) == 0);  // 1
kham_string_free(eng);
kham_string_free(tha);

bool cross = kham_sounds_like_cross_lang("สมชาย", "Somchai");  // true

SpellChecker

docs.rs ↗

Spelling correction over the built-in 62k-word dictionary. Candidates within Levenshtein edit distance ≤ 2 are returned, ranked by lk82 phonetic similarity, then edit distance, then TNC corpus frequency. Accepts single Thai words — segment first for multi-word input.

use kham_core::spell::SpellChecker;

// Reuse the checker — builtin() loads the TNC frequency map once
let checker = SpellChecker::builtin();

let suggs = checker.suggestions("กีนข้าว", 5);
for s in &suggs {
    println!("{:12} edit={} soundex={} freq={}",
        s.word, s.edit_distance, s.soundex_match, s.freq_score);
}
// กินข้าว  edit=1  soundex=true  freq=…

// Correctly spelled words appear with edit_distance = 0
let exact = checker.suggestions("กิน", 1);
assert_eq!(exact[0].word, "กิน");
assert_eq!(exact[0].edit_distance, 0);

// Suggestion fields:
// s.word          — String   candidate word from the dictionary
// s.edit_distance — u8       Levenshtein distance (0–2)
// s.soundex_match — bool     lk82 codes match
// s.freq_score    — u32      TNC corpus frequency (0 if not in table)

// Single best correction
let checker = SpellChecker::builtin();
if let Some(corrected) = checker.did_you_mean("กีนข้าว") {
    println!("Did you mean: {corrected}");  // กินข้าว
}
// Correct whole text
let text = "กีนข้าวกับปลา";
let out = checker.correct_text(text);
println!("{out}");

import kham

# spell_suggestions(word, max_n) → list[SpellSuggestion]
suggs = kham.spell_suggestions("กีนข้าว", 5)
for s in suggs:
    print(f"{s.word:12} edit={s.edit_distance} soundex={s.soundex_match} freq={s.freq_score}")
# กินข้าว  edit=1  soundex=True  freq=…

# SpellSuggestion fields:
# s.word          — str   candidate word
# s.edit_distance — int   Levenshtein distance (0–2)
# s.soundex_match — bool  lk82 codes match
# s.freq_score    — int   TNC corpus frequency (0 if absent)

# Correct word → edit_distance 0
top = kham.spell_suggestions("กิน", 1)
assert top[0].word == "กิน" and top[0].edit_distance == 0

import init, { spell_suggestions } from '/wasm/kham_wasm.js';
await init();

// spell_suggestions(word, maxN) → SpellSuggestion[]
const suggs = spell_suggestions("กีนข้าว", 5);
for (const s of suggs) {
  console.log(s.word, 'edit:', s.edit_distance,
              'soundex:', s.soundex_match, 'freq:', s.freq_score);
}
// กินข้าว  edit: 1  soundex: true  freq: …

// SpellSuggestion properties:
// s.word          — string   candidate word
// s.edit_distance — number   Levenshtein distance (0–2)
// s.soundex_match — boolean  lk82 codes match
// s.freq_score    — number   TNC corpus frequency

#include "kham.h"

// kham_spell_suggestions(word, max_n) → KhamSpellList*
// Free with kham_spell_list_free()

KhamSpellList *list = kham_spell_suggestions("กีนข้าว", 5);
for (size_t i = 0; i < list->len; i++) {
    KhamSpellSuggestion *s = &list->suggestions[i];
    printf("%-12s  edit=%u  soundex=%d  freq=%u\n",
           s->word, s->edit_distance, s->soundex_match, s->freq_score);
}
kham_spell_list_free(list);

// KhamSpellSuggestion fields:
//   word          — char*   candidate word (heap-allocated)
//   edit_distance — uint8_t Levenshtein distance (0–2)
//   soundex_match — bool    lk82 codes match
//   freq_score    — uint32_t TNC corpus frequency

Note: SpellChecker expects a single word. For text with multiple words, segment first with Tokenizer::segment() and check each Thai token individually.

KeyExtractor

docs.rs ↗

Unsupervised keyword extraction using TF × inverse-corpus-frequency scoring. Words rare in the TNC corpus score higher than common function words. Stopwords and single-character tokens are always excluded. Results are sorted by score descending.

use kham_core::keyword::KeyExtractor;

// Reuse the extractor — builtin() loads TNC freq + stopwords once
let extractor = KeyExtractor::builtin();

let text = "นักวิทยาศาสตร์ค้นพบดาวเคราะห์ใหม่ในระบบสุริยะ ดาวดวงนี้โคจรอยู่ใกล้ดาวเคราะห์น้อย";

let keywords = extractor.extract(text, 5);
for kw in &keywords {
    println!("{:12} score={:.4} count={}", kw.word, kw.score, kw.count);
}

// Keyword fields:
// kw.word  — String  the keyword text
// kw.score — f32     TF × (max_freq+1) / (corpus_freq+1)
// kw.count — usize   raw occurrence count in the document

let extractor = KeyExtractor::builtin();
let text = "นักพัฒนาซอฟต์แวร์เขียนโค้ดทุกวัน นักพัฒนาซอฟต์แวร์ใช้ภาษาต่าง ๆ";
let phrases = extractor.extract_phrases(text, 5);
for p in &phrases {
    println!("{:20} score={:.4} count={}", p.word, p.score, p.count);
}

import kham

# extract_keywords(text, max_n) → list[Keyword]
text = ("นักวิทยาศาสตร์ค้นพบดาวเคราะห์ใหม่ในระบบสุริยะ "
        "ดาวดวงนี้โคจรอยู่ใกล้ดาวเคราะห์น้อย")

keywords = kham.extract_keywords(text, 5)
for kw in keywords:
    print(f"{kw.word:12} score={kw.score:.4f} count={kw.count}")

# Keyword fields:
# kw.word  — str    the keyword text
# kw.score — float  relevance score (TF × IDF_proxy)
# kw.count — int    raw occurrence count

import init, { extract_keywords } from '/wasm/kham_wasm.js';
await init();

// extract_keywords(text, maxN) → Keyword[]
const text = "นักวิทยาศาสตร์ค้นพบดาวเคราะห์ใหม่ในระบบสุริยะ " +
             "ดาวดวงนี้โคจรอยู่ใกล้ดาวเคราะห์น้อย";

const keywords = extract_keywords(text, 5);
for (const kw of keywords) {
  console.log(kw.word, 'score:', kw.score.toFixed(4), 'count:', kw.count);
}

// Keyword properties:
// kw.word  — string  the keyword text
// kw.score — number  relevance score (TF × IDF_proxy)
// kw.count — number  raw occurrence count

#include "kham.h"

// kham_keywords(text, max_n) → KhamKeywordList*
// Free with kham_keyword_list_free()

const char *text =
    "นักวิทยาศาสตร์ค้นพบดาวเคราะห์ใหม่ในระบบสุริยะ "
    "ดาวดวงนี้โคจรอยู่ใกล้ดาวเคราะห์น้อย";

KhamKeywordList *list = kham_keywords(text, 5);
for (size_t i = 0; i < list->len; i++) {
    KhamKeyword *kw = &list->keywords[i];
    printf("%-12s  score=%.4f  count=%zu\n",
           kw->word, kw->score, kw->count);
}
kham_keyword_list_free(list);

// KhamKeyword fields:
//   word  — char*   keyword text (heap-allocated)
//   score — float   TF × IDF_proxy relevance score
//   count — size_t  raw occurrence count

← Getting Started · Full rustdoc on docs.rs ↗