🐘
PostgreSQL FTS
kham-pg is a PostgreSQL text-search parser extension. It tokenises Thai text for full-text search, replacing the built-in parser for Thai-script documents.
1
Prerequisites
PostgreSQL 14 or later, Rust toolchain (1.85+), and pg_config in your PATH.
pg_config --version # PostgreSQL 14 or later
rustup target add x86_64-unknown-linux-gnu # if cross-compiling 2
Build and install
# From the kham monorepo root
cargo build -p kham-pg --release
make -C kham-pg install
# Verify the .so is in place
ls "$(pg_config --pkglibdir)"/kham* 3
Load and configure
Load the parser, create a text search configuration, and add mappings for Thai and other token types.
-- Load the shared library
LOAD 'kham';
-- Create a text search parser backed by kham
CREATE TEXT SEARCH CONFIGURATION kham_cfg (PARSER = kham);
-- Map token types to dictionaries
ALTER TEXT SEARCH CONFIGURATION kham_cfg
ADD MAPPING FOR thai WITH simple;
ALTER TEXT SEARCH CONFIGURATION kham_cfg
ADD MAPPING FOR latin WITH english_stem;
ALTER TEXT SEARCH CONFIGURATION kham_cfg
ADD MAPPING FOR integer WITH simple; 4
Index documents
CREATE TABLE articles (
id SERIAL PRIMARY KEY,
title TEXT,
body TEXT,
tsv TSVECTOR
);
-- Populate the tsvector column
UPDATE articles
SET tsv = to_tsvector('kham_cfg', coalesce(title, '') || ' ' || coalesce(body, ''));
-- GIN index for fast search
CREATE INDEX articles_tsv_idx ON articles USING GIN(tsv);
-- Keep it up-to-date automatically
CREATE TRIGGER articles_tsv_update
BEFORE INSERT OR UPDATE ON articles
FOR EACH ROW EXECUTE FUNCTION
tsvector_update_trigger(tsv, 'public.kham_cfg', title, body); 5
Search with ts_headline
SELECT
id,
ts_headline(
'kham_cfg',
body,
query,
'StartSel=<mark>, StopSel=</mark>, MaxWords=20'
) AS snippet
FROM articles,
to_tsquery('kham_cfg', 'ข้าว & ปลา') AS query
WHERE tsv @@ query
ORDER BY ts_rank(tsv, query) DESC
LIMIT 10;