PostgreSQL FTS
kham-pg is a PostgreSQL text-search parser extension. It tokenises Thai text for full-text search, replacing the built-in parser for Thai-script documents. Each Thai token expands to up to six colocated lexemes: the word itself, lk82 soundex code, RTGS romanization, ASCII number form, and a POS tag. Thai stopwords are suppressed automatically.
Docker Hub (fastest start)
Pull a ready-to-use image — no Rust toolchain, no compiler, no manual install. Multi-arch: amd64 + arm64.
# PostgreSQL 17 with kham_pg pre-installed (latest = PG 17)
docker run --rm -e POSTGRES_PASSWORD=secret \
-p 5432:5432 nickmsft/kham-pg:latest
# Specific PostgreSQL version — available tags:
# v0.8.2-pg14 v0.8.2-pg16 v0.8.2-pg17 v0.8.2-pg18
docker run --rm -e POSTGRES_PASSWORD=secret \
-p 5432:5432 nickmsft/kham-pg:v0.8.2-pg17
# Connect and use immediately
psql -h localhost -U postgres -c "
CREATE EXTENSION kham_pg;
SELECT to_tsvector('kham', 'กินข้าวกับปลา');
" Prerequisites
PostgreSQL 14 or later, Rust toolchain (1.85+), and pg_config in your PATH.
pg_config --version # PostgreSQL 14 or later
rustup target add x86_64-unknown-linux-gnu # if cross-compiling Build and install
# From the kham monorepo root
cargo build -p kham-pg --release
make -C kham-pg install
# Verify the .so is in place
ls "$(pg_config --pkglibdir)"/kham* Load and use
The extension ships with a ready-to-use kham configuration. Thai stopwords are suppressed automatically; no extra configuration needed.
-- Load the extension
CREATE EXTENSION kham_pg;
-- Full-text search with the built-in kham configuration
SELECT to_tsvector('kham', 'กินข้าวกับปลา') @@ plainto_tsquery('kham', 'ปลา');
-- t (กับ is a stopword and is suppressed; ปลา is indexed) Index documents
CREATE TABLE articles (
id SERIAL PRIMARY KEY,
title TEXT,
body TEXT,
tsv TSVECTOR
);
-- Populate the tsvector column
UPDATE articles
SET tsv = to_tsvector('kham', coalesce(title, '') || ' ' || coalesce(body, ''));
-- GIN index for fast search
CREATE INDEX articles_tsv_idx ON articles USING GIN(tsv);
-- Keep it up-to-date automatically
CREATE TRIGGER articles_tsv_update
BEFORE INSERT OR UPDATE ON articles
FOR EACH ROW EXECUTE FUNCTION
tsvector_update_trigger(tsv, 'public.kham', title, body); Search with ts_headline
SELECT
id,
ts_headline(
'kham',
body,
query,
'StartSel=<mark>, StopSel=</mark>, MaxWords=20'
) AS snippet
FROM articles,
to_tsquery('kham', 'ข้าว & ปลา') AS query
WHERE tsv @@ query
ORDER BY ts_rank(tsv, query) DESC
LIMIT 10; Phonetic and romanization search
Every Thai token is expanded with its lk82 soundex code and RTGS romanization. Near-homophones and Latin-script queries match automatically.
-- Latin-script query matches Thai document (RTGS romanization)
SELECT to_tsvector('kham', 'กินข้าวกับปลา') @@ plainto_tsquery('kham', 'pla');
-- t (ปลา → 'pla' stored as colocated lexeme)
-- Phonetic query using the lk82 code for ปลา (4800)
SELECT title FROM articles
WHERE tsv @@ to_tsquery('kham', '4800'); Thai number normalization
Thai digit strings are indexed alongside their ASCII equivalent, enabling cross-script numeric queries.
-- ๑๒๓ is stored as both ๑๒๓ and 123 in the same tsvector position
SELECT to_tsvector('kham', '๑๒๓') @@ plainto_tsquery('kham', '123') AS found;
-- t
-- Search documents containing ๑๒๓ using ASCII digits
SELECT title FROM articles WHERE tsv @@ plainto_tsquery('kham', '123'); POS lexeme filtering
Tokens with a known part of speech emit a colocated pos_<tag> lexeme. Use the ::tsquery cast so the underscore is treated as part of the lexeme.
-- Find documents containing a verb
SELECT title FROM articles WHERE tsv @@ 'pos_verb'::tsquery;
-- Combine POS filter with a content query
SELECT title FROM articles
WHERE tsv @@ ('pos_noun'::tsquery && plainto_tsquery('kham', 'กิน'));
-- Inspect which POS tags are stored for a word
SELECT lexeme FROM unnest(to_tsvector('kham', 'ปลา'))
WHERE lexeme LIKE 'pos_%';
-- pos_noun Alternative soundex dictionaries
Two additional dictionary variants are available for finer phonetic discrimination. Create a custom configuration to use them.
-- udom83 soundex — finer sibilant and liquid distinctions
CREATE TEXT SEARCH CONFIGURATION kham_udom83 (PARSER = kham);
ALTER TEXT SEARCH CONFIGURATION kham_udom83
ADD MAPPING FOR thai, named WITH kham_fts_dict_udom83;
ALTER TEXT SEARCH CONFIGURATION kham_udom83
ADD MAPPING FOR latin, number, unknown WITH kham_dict;
-- MetaSound — per-syllable encoding, most discriminating
CREATE TEXT SEARCH CONFIGURATION kham_metasound (PARSER = kham);
ALTER TEXT SEARCH CONFIGURATION kham_metasound
ADD MAPPING FOR thai, named WITH kham_fts_dict_metasound;
ALTER TEXT SEARCH CONFIGURATION kham_metasound
ADD MAPPING FOR latin, number, unknown WITH kham_dict; Ranking and field boosting
Use ts_rank() or ts_rank_cd() to order results by relevance. Boost individual fields with setweight() — weights A (1.0) > B (0.4) > C (0.2) > D (0.1).
-- Basic rank (higher = more relevant)
SELECT id, ts_rank(tsv, query) AS rank
FROM articles, to_tsquery('kham', 'ข้าว') query
WHERE tsv @@ query
ORDER BY rank DESC;
-- Cover-density rank — penalises scattered token matches, better for Thai
SELECT id, ts_rank_cd(tsv, query) AS rank
FROM articles, to_tsquery('kham', 'ข้าว') query
WHERE tsv @@ query
ORDER BY rank DESC;
-- Field boosting: build tsvector with per-field weights
-- Weight A tokens score ~5× higher than weight C
UPDATE articles
SET tsv =
setweight(to_tsvector('kham', coalesce(title, '')), 'A') ||
setweight(to_tsvector('kham', coalesce(body, '')), 'C');