Getting Started
kham v0.5.0 — pick your target and be up and running in minutes.
🦀
Rust
Native crate — zero-copy, no_std core
Install
[dependencies]
kham-core = "0.5"
Usage
use kham_core::Segmenter;
fn main() {
let seg = Segmenter::new();
// Segment into &str slices (zero-copy)
let tokens = seg.segment("กินข้าวกับปลา");
println!("{:?}", tokens);
// ["กิน", "ข้าว", "กับ", "ปลา"]
// Rich tokens with kind + byte/char spans
for tok in seg.segment_tokens("Hello กรุงเทพ 2024") {
println!("{:?} kind={:?} chars={}..{}", tok.text, tok.kind, tok.char_span.start, tok.char_span.end);
}
}
🐍
Python
PyO3 bindings — segment() and segment_tokens()
Install
pip install kham
Usage
import kham
# Segment into a list of strings
tokens = kham.segment("กินข้าวกับปลา")
print(tokens)
# ['กิน', 'ข้าว', 'กับ', 'ปลา']
# Rich Token objects
for tok in kham.segment_tokens("Hello กรุงเทพ 2024"):
    print(tok.text, tok.kind, tok.char_start, tok.char_end)
🌐
WebAssembly / npm
Runs in browser and Node.js — no server needed
Install
npm install kham-wasm
Usage
import init, { segment, segment_tokens } from 'kham-wasm';
// Initialise once (fetches the .wasm file)
await init();
const words = segment("กินข้าวกับปลา");
console.log(words);
// ["กิน", "ข้าว", "กับ", "ปลา"]
const tokens = segment_tokens("Hello กรุงเทพ 2024");
tokens.forEach(t => console.log(t.text, t.kind, t.char_start, t.char_end));
💻
CLI
Command-line Thai segmenter
Install
cargo install kham-cli
Usage
# Segment Thai text
kham "กินข้าวกับปลา"
# กิน|ข้าว|กับ|ปลา
# FTS mode — kind, POS, NE, stop, syn per token
kham --fts "กินข้าวกับปลา"
# FTS + phonetic code in the syn= field
kham --fts --soundex lk82 "กินข้าวกับปลา"
🐘
PostgreSQL FTS
Full-text search parser extension for PostgreSQL 14+
Install
# Build and install (requires pg_config in PATH)
cargo build -p kham-pg --release
make -C kham-pg install
Usage
-- Load extension
LOAD 'kham';
-- Create a text search configuration
CREATE TEXT SEARCH CONFIGURATION kham_cfg (PARSER = kham);
ALTER TEXT SEARCH CONFIGURATION kham_cfg
ADD MAPPING FOR thai WITH simple;
-- Index and search
CREATE TABLE docs (id SERIAL, body TEXT, tsv TSVECTOR);
UPDATE docs SET tsv = to_tsvector('kham_cfg', body);
CREATE INDEX docs_tsv ON docs USING GIN(tsv);
SELECT id, ts_headline('kham_cfg', body, query) AS snippet
FROM docs, to_tsquery('kham_cfg', 'ข้าว') query
WHERE tsv @@ query;
🗃️
SQLite FTS5
Loadable tokenizer extension for SQLite FTS5
Install
# Build (requires SQLite headers)
cargo build -p kham-sqlite --release
# macOS: brew install sqlite (system sqlite3 disables load_extension)
Usage
.load ./target/release/libkham_sqlite
CREATE VIRTUAL TABLE docs USING fts5(
body,
tokenize = 'kham'
);
INSERT INTO docs VALUES ('กินข้าวกับปลา');
INSERT INTO docs VALUES ('กรุงเทพมหานครเป็นเมืองหลวง');
SELECT * FROM docs WHERE docs MATCH 'ปลา';