pg_similarity

pg_similarity

pg_similarity : support similarity queries

Overview

ID Extension Package Version Category License Language
1840
pg_similarity
pg_similarity
1.0
RAG
BSD 3-Clause
C
Attribute Has Binary Has Library Need Load Has DDL Relocatable Trusted
--s-d-r
No
Yes
No
Yes
yes
no
Relationships
See Also
vector
smlar
fuzzystrmatch
pg_trgm
vchord
pg_bigm
citext
unaccent

Packages

Type Repo Version PG Major Compatibility Package Pattern Dependencies
EXT
MIXED
1.0
18
17
16
15
14
pg_similarity -
RPM
PIGSTY
1.0
18
17
16
15
14
pg_similarity_$v -
DEB
PGDG
1.0
18
17
16
15
14
postgresql-$v-similarity -
Linux / PG PG18 PG17 PG16 PG15 PG14
el8.x86_64
PGDG 1.0
PIGSTY 1.0
PIGSTY 1.0
PIGSTY 1.0
PIGSTY 1.0
el8.aarch64
PGDG 1.0
PIGSTY 1.0
PIGSTY 1.0
PIGSTY 1.0
PIGSTY 1.0
el9.x86_64
PGDG 1.0
PIGSTY 1.0
PIGSTY 1.0
PIGSTY 1.0
PIGSTY 1.0
el9.aarch64
PGDG 1.0
PIGSTY 1.0
PIGSTY 1.0
PIGSTY 1.0
PIGSTY 1.0
el10.x86_64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
el10.aarch64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
d12.x86_64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
d12.aarch64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
d13.x86_64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
d13.aarch64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
u22.x86_64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
u22.aarch64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
u24.x86_64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
u24.aarch64
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
PGDG 1.0
Package Version OS ORG SIZE File URL
pg_similarity_18 1.0 el8.x86_64 pgdg 43.2 KiB pg_similarity_18-1.0-3PGDG.rhel8.x86_64.rpm
pg_similarity_18 1.0 el8.x86_64 pigsty 44.9 KiB pg_similarity_18-1.0-2PIGSTY.el8.x86_64.rpm
pg_similarity_18 1.0 el8.aarch64 pgdg 40.7 KiB pg_similarity_18-1.0-3PGDG.rhel8.aarch64.rpm
pg_similarity_18 1.0 el8.aarch64 pigsty 43.0 KiB pg_similarity_18-1.0-2PIGSTY.el8.aarch64.rpm
pg_similarity_18 1.0 el9.x86_64 pgdg 42.6 KiB pg_similarity_18-1.0-3PGDG.rhel9.x86_64.rpm
pg_similarity_18 1.0 el9.x86_64 pigsty 41.9 KiB pg_similarity_18-1.0-2PIGSTY.el9.x86_64.rpm
pg_similarity_18 1.0 el9.aarch64 pgdg 41.2 KiB pg_similarity_18-1.0-3PGDG.rhel9.aarch64.rpm
pg_similarity_18 1.0 el9.aarch64 pigsty 40.6 KiB pg_similarity_18-1.0-2PIGSTY.el9.aarch64.rpm
pg_similarity_18 1.0 el10.x86_64 pgdg 43.6 KiB pg_similarity_18-1.0-3PGDG.rhel10.x86_64.rpm
pg_similarity_18 1.0 el10.x86_64 pigsty 42.3 KiB pg_similarity_18-1.0-2PIGSTY.el10.x86_64.rpm
pg_similarity_18 1.0 el10.aarch64 pgdg 42.3 KiB pg_similarity_18-1.0-3PGDG.rhel10.aarch64.rpm
pg_similarity_18 1.0 el10.aarch64 pigsty 41.0 KiB pg_similarity_18-1.0-2PIGSTY.el10.aarch64.rpm
postgresql-18-similarity 1.0 d12.x86_64 pgdg 98.6 KiB postgresql-18-similarity_1.0-9.pgdg12+1_amd64.deb
postgresql-18-similarity 1.0 d12.aarch64 pgdg 96.2 KiB postgresql-18-similarity_1.0-9.pgdg12+1_arm64.deb
postgresql-18-similarity 1.0 d13.x86_64 pgdg 98.5 KiB postgresql-18-similarity_1.0-9.pgdg13+1_amd64.deb
postgresql-18-similarity 1.0 d13.aarch64 pgdg 96.2 KiB postgresql-18-similarity_1.0-9.pgdg13+1_arm64.deb
postgresql-18-similarity 1.0 u22.x86_64 pgdg 98.4 KiB postgresql-18-similarity_1.0-9.pgdg22.04+1_amd64.deb
postgresql-18-similarity 1.0 u22.aarch64 pgdg 96.2 KiB postgresql-18-similarity_1.0-9.pgdg22.04+1_arm64.deb
postgresql-18-similarity 1.0 u24.x86_64 pgdg 97.5 KiB postgresql-18-similarity_1.0-9.pgdg24.04+1_amd64.deb
postgresql-18-similarity 1.0 u24.aarch64 pgdg 94.9 KiB postgresql-18-similarity_1.0-9.pgdg24.04+1_arm64.deb
Package Version OS ORG SIZE File URL
pg_similarity_17 1.0 el8.x86_64 pigsty 44.8 KiB pg_similarity_17-1.0-2PIGSTY.el8.x86_64.rpm
pg_similarity_17 1.0 el8.aarch64 pigsty 42.9 KiB pg_similarity_17-1.0-2PIGSTY.el8.aarch64.rpm
pg_similarity_17 1.0 el9.x86_64 pigsty 42.0 KiB pg_similarity_17-1.0-2PIGSTY.el9.x86_64.rpm
pg_similarity_17 1.0 el9.aarch64 pigsty 40.6 KiB pg_similarity_17-1.0-2PIGSTY.el9.aarch64.rpm
pg_similarity_17 1.0 el10.x86_64 pgdg 43.6 KiB pg_similarity_17-1.0-3PGDG.rhel10.x86_64.rpm
pg_similarity_17 1.0 el10.x86_64 pigsty 42.3 KiB pg_similarity_17-1.0-2PIGSTY.el10.x86_64.rpm
pg_similarity_17 1.0 el10.aarch64 pgdg 42.3 KiB pg_similarity_17-1.0-3PGDG.rhel10.aarch64.rpm
pg_similarity_17 1.0 el10.aarch64 pigsty 41.0 KiB pg_similarity_17-1.0-2PIGSTY.el10.aarch64.rpm
postgresql-17-similarity 1.0 d12.x86_64 pgdg 98.7 KiB postgresql-17-similarity_1.0-9.pgdg12+1_amd64.deb
postgresql-17-similarity 1.0 d12.aarch64 pgdg 96.1 KiB postgresql-17-similarity_1.0-9.pgdg12+1_arm64.deb
postgresql-17-similarity 1.0 d13.x86_64 pgdg 98.8 KiB postgresql-17-similarity_1.0-9.pgdg13+1_amd64.deb
postgresql-17-similarity 1.0 d13.aarch64 pgdg 96.0 KiB postgresql-17-similarity_1.0-9.pgdg13+1_arm64.deb
postgresql-17-similarity 1.0 u22.x86_64 pgdg 103.7 KiB postgresql-17-similarity_1.0-9.pgdg22.04+1_amd64.deb
postgresql-17-similarity 1.0 u22.aarch64 pgdg 101.5 KiB postgresql-17-similarity_1.0-9.pgdg22.04+1_arm64.deb
postgresql-17-similarity 1.0 u24.x86_64 pgdg 97.6 KiB postgresql-17-similarity_1.0-9.pgdg24.04+1_amd64.deb
postgresql-17-similarity 1.0 u24.aarch64 pgdg 95.0 KiB postgresql-17-similarity_1.0-9.pgdg24.04+1_arm64.deb
Package Version OS ORG SIZE File URL
pg_similarity_16 1.0 el8.x86_64 pigsty 44.8 KiB pg_similarity_16-1.0-2PIGSTY.el8.x86_64.rpm
pg_similarity_16 1.0 el8.aarch64 pigsty 43.0 KiB pg_similarity_16-1.0-2PIGSTY.el8.aarch64.rpm
pg_similarity_16 1.0 el9.x86_64 pigsty 41.9 KiB pg_similarity_16-1.0-2PIGSTY.el9.x86_64.rpm
pg_similarity_16 1.0 el9.aarch64 pigsty 40.5 KiB pg_similarity_16-1.0-2PIGSTY.el9.aarch64.rpm
pg_similarity_16 1.0 el10.x86_64 pgdg 43.6 KiB pg_similarity_16-1.0-3PGDG.rhel10.x86_64.rpm
pg_similarity_16 1.0 el10.x86_64 pigsty 42.3 KiB pg_similarity_16-1.0-2PIGSTY.el10.x86_64.rpm
pg_similarity_16 1.0 el10.aarch64 pgdg 42.3 KiB pg_similarity_16-1.0-3PGDG.rhel10.aarch64.rpm
pg_similarity_16 1.0 el10.aarch64 pigsty 41.0 KiB pg_similarity_16-1.0-2PIGSTY.el10.aarch64.rpm
postgresql-16-similarity 1.0 d12.x86_64 pgdg 98.7 KiB postgresql-16-similarity_1.0-9.pgdg12+1_amd64.deb
postgresql-16-similarity 1.0 d12.aarch64 pgdg 96.1 KiB postgresql-16-similarity_1.0-9.pgdg12+1_arm64.deb
postgresql-16-similarity 1.0 d13.x86_64 pgdg 98.6 KiB postgresql-16-similarity_1.0-9.pgdg13+1_amd64.deb
postgresql-16-similarity 1.0 d13.aarch64 pgdg 96.2 KiB postgresql-16-similarity_1.0-9.pgdg13+1_arm64.deb
postgresql-16-similarity 1.0 u22.x86_64 pgdg 103.6 KiB postgresql-16-similarity_1.0-9.pgdg22.04+1_amd64.deb
postgresql-16-similarity 1.0 u22.aarch64 pgdg 101.5 KiB postgresql-16-similarity_1.0-9.pgdg22.04+1_arm64.deb
postgresql-16-similarity 1.0 u24.x86_64 pgdg 97.5 KiB postgresql-16-similarity_1.0-9.pgdg24.04+1_amd64.deb
postgresql-16-similarity 1.0 u24.aarch64 pgdg 95.1 KiB postgresql-16-similarity_1.0-9.pgdg24.04+1_arm64.deb
Package Version OS ORG SIZE File URL
pg_similarity_15 1.0 el8.x86_64 pigsty 45.9 KiB pg_similarity_15-1.0-2PIGSTY.el8.x86_64.rpm
pg_similarity_15 1.0 el8.aarch64 pigsty 43.9 KiB pg_similarity_15-1.0-2PIGSTY.el8.aarch64.rpm
pg_similarity_15 1.0 el9.x86_64 pigsty 44.1 KiB pg_similarity_15-1.0-2PIGSTY.el9.x86_64.rpm
pg_similarity_15 1.0 el9.aarch64 pigsty 42.5 KiB pg_similarity_15-1.0-2PIGSTY.el9.aarch64.rpm
pg_similarity_15 1.0 el10.x86_64 pgdg 45.8 KiB pg_similarity_15-1.0-3PGDG.rhel10.x86_64.rpm
pg_similarity_15 1.0 el10.x86_64 pigsty 44.5 KiB pg_similarity_15-1.0-2PIGSTY.el10.x86_64.rpm
pg_similarity_15 1.0 el10.aarch64 pgdg 44.2 KiB pg_similarity_15-1.0-3PGDG.rhel10.aarch64.rpm
pg_similarity_15 1.0 el10.aarch64 pigsty 43.0 KiB pg_similarity_15-1.0-2PIGSTY.el10.aarch64.rpm
postgresql-15-similarity 1.0 d12.x86_64 pgdg 99.5 KiB postgresql-15-similarity_1.0-9.pgdg12+1_amd64.deb
postgresql-15-similarity 1.0 d12.aarch64 pgdg 96.9 KiB postgresql-15-similarity_1.0-9.pgdg12+1_arm64.deb
postgresql-15-similarity 1.0 d13.x86_64 pgdg 99.7 KiB postgresql-15-similarity_1.0-9.pgdg13+1_amd64.deb
postgresql-15-similarity 1.0 d13.aarch64 pgdg 96.9 KiB postgresql-15-similarity_1.0-9.pgdg13+1_arm64.deb
postgresql-15-similarity 1.0 u22.x86_64 pgdg 105.4 KiB postgresql-15-similarity_1.0-9.pgdg22.04+1_amd64.deb
postgresql-15-similarity 1.0 u22.aarch64 pgdg 103.0 KiB postgresql-15-similarity_1.0-9.pgdg22.04+1_arm64.deb
postgresql-15-similarity 1.0 u24.x86_64 pgdg 99.3 KiB postgresql-15-similarity_1.0-9.pgdg24.04+1_amd64.deb
postgresql-15-similarity 1.0 u24.aarch64 pgdg 96.4 KiB postgresql-15-similarity_1.0-9.pgdg24.04+1_arm64.deb
Package Version OS ORG SIZE File URL
pg_similarity_14 1.0 el8.x86_64 pigsty 45.9 KiB pg_similarity_14-1.0-2PIGSTY.el8.x86_64.rpm
pg_similarity_14 1.0 el8.aarch64 pigsty 43.9 KiB pg_similarity_14-1.0-2PIGSTY.el8.aarch64.rpm
pg_similarity_14 1.0 el9.x86_64 pigsty 44.2 KiB pg_similarity_14-1.0-2PIGSTY.el9.x86_64.rpm
pg_similarity_14 1.0 el9.aarch64 pigsty 42.5 KiB pg_similarity_14-1.0-2PIGSTY.el9.aarch64.rpm
pg_similarity_14 1.0 el10.x86_64 pgdg 45.7 KiB pg_similarity_14-1.0-3PGDG.rhel10.x86_64.rpm
pg_similarity_14 1.0 el10.x86_64 pigsty 44.5 KiB pg_similarity_14-1.0-2PIGSTY.el10.x86_64.rpm
pg_similarity_14 1.0 el10.aarch64 pgdg 44.2 KiB pg_similarity_14-1.0-3PGDG.rhel10.aarch64.rpm
pg_similarity_14 1.0 el10.aarch64 pigsty 43.0 KiB pg_similarity_14-1.0-2PIGSTY.el10.aarch64.rpm
postgresql-14-similarity 1.0 d12.x86_64 pgdg 99.5 KiB postgresql-14-similarity_1.0-9.pgdg12+1_amd64.deb
postgresql-14-similarity 1.0 d12.aarch64 pgdg 96.8 KiB postgresql-14-similarity_1.0-9.pgdg12+1_arm64.deb
postgresql-14-similarity 1.0 d13.x86_64 pgdg 99.4 KiB postgresql-14-similarity_1.0-9.pgdg13+1_amd64.deb
postgresql-14-similarity 1.0 d13.aarch64 pgdg 96.9 KiB postgresql-14-similarity_1.0-9.pgdg13+1_arm64.deb
postgresql-14-similarity 1.0 u22.x86_64 pgdg 105.4 KiB postgresql-14-similarity_1.0-9.pgdg22.04+1_amd64.deb
postgresql-14-similarity 1.0 u22.aarch64 pgdg 102.8 KiB postgresql-14-similarity_1.0-9.pgdg22.04+1_arm64.deb
postgresql-14-similarity 1.0 u24.x86_64 pgdg 99.3 KiB postgresql-14-similarity_1.0-9.pgdg24.04+1_amd64.deb
postgresql-14-similarity 1.0 u24.aarch64 pgdg 96.4 KiB postgresql-14-similarity_1.0-9.pgdg24.04+1_arm64.deb

Source

pig build pkg pg_similarity;		# build rpm

Install

Make sure PGDG and PIGSTY repo available:

pig repo add pgsql -u   # add both repo and update cache

Install this extension with pig:

pig install pg_similarity;		# install via package name, for the active PG version

pig install pg_similarity -v 18;   # install for PG 18
pig install pg_similarity -v 17;   # install for PG 17
pig install pg_similarity -v 16;   # install for PG 16
pig install pg_similarity -v 15;   # install for PG 15
pig install pg_similarity -v 14;   # install for PG 14

Create this extension with:

CREATE EXTENSION pg_similarity;

Usage

pg_similarity: Support similarity queries on PostgreSQL. Source: README.md

pg_similarity is an extension to support similarity queries on PostgreSQL. The implementation is tightly integrated in the RDBMS in the sense that it defines operators so instead of the traditional operators (= and <>) you can use ~~~ and ~!~ (any of these operators represents a similarity function).

pg_similarity has three main components:

  • Functions: a set of functions that implements similarity algorithms available in the literature. These functions can be used as UDFs and will be the base for implementing the similarity operators;
  • Operators: a set of operators defined at the top of similarity functions. They use similarity functions to obtain the similarity threshold and compare its value to a user-defined threshold to decide if it is a match or not;
  • Session Variables: a set of variables that store similarity function parameters. These variables can be defined at run time.

Functions and Operators

This extension supports a set of similarity algorithms. The most known algorithms are covered by this extension. You must be aware that each algorithm is suited for a specific domain. The following algorithms are provided:

  • L1 Distance (as known as City Block or Manhattan Distance)
  • Cosine Distance
  • Dice Coefficient
  • Euclidean Distance
  • Hamming Distance
  • Jaccard Coefficient
  • Jaro Distance
  • Jaro-Winkler Distance
  • Levenshtein Distance
  • Matching Coefficient
  • Monge-Elkan Coefficient
  • Needleman-Wunsch Coefficient
  • Overlap Coefficient
  • Q-Gram Distance
  • Smith-Waterman Coefficient
  • Smith-Waterman-Gotoh Coefficient
  • Soundex Distance
Algorithm Function Operator Use Index? Parameters
L1 Distance block(text, text) returns float8 ~++ yes pg_similarity.block_tokenizer, pg_similarity.block_threshold, pg_similarity.block_is_normalized
Cosine Distance cosine(text, text) returns float8 ~## yes pg_similarity.cosine_tokenizer, pg_similarity.cosine_threshold, pg_similarity.cosine_is_normalized
Dice Coefficient dice(text, text) returns float8 ~-~ yes pg_similarity.dice_tokenizer, pg_similarity.dice_threshold, pg_similarity.dice_is_normalized
Euclidean Distance euclidean(text, text) returns float8 ~!! yes pg_similarity.euclidean_tokenizer, pg_similarity.euclidean_threshold, pg_similarity.euclidean_is_normalized
Hamming Distance hamming(bit varying, bit varying) returns float8 / hamming_text(text, text) returns float8 ~@~ no pg_similarity.hamming_threshold, pg_similarity.hamming_is_normalized
Jaccard Coefficient jaccard(text, text) returns float8 ~?? yes pg_similarity.jaccard_tokenizer, pg_similarity.jaccard_threshold, pg_similarity.jaccard_is_normalized
Jaro Distance jaro(text, text) returns float8 ~%% no pg_similarity.jaro_threshold, pg_similarity.jaro_is_normalized
Jaro-Winkler Distance jarowinkler(text, text) returns float8 ~@@ no pg_similarity.jarowinkler_threshold, pg_similarity.jarowinkler_is_normalized
Levenshtein Distance lev(text, text) returns float8 ~== no pg_similarity.levenshtein_threshold, pg_similarity.levenshtein_is_normalized
Matching Coefficient matchingcoefficient(text, text) returns float8 ~^^ yes pg_similarity.matching_tokenizer, pg_similarity.matching_threshold, pg_similarity.matching_is_normalized
Monge-Elkan Coefficient mongeelkan(text, text) returns float8 ~|| no pg_similarity.mongeelkan_tokenizer, pg_similarity.mongeelkan_threshold, pg_similarity.mongeelkan_is_normalized
Needleman-Wunsch Coefficient needlemanwunsch(text, text) returns float8 ~#~ no pg_similarity.nw_threshold, pg_similarity.nw_is_normalized
Overlap Coefficient overlapcoefficient(text, text) returns float8 ~** yes pg_similarity.overlap_tokenizer, pg_similarity.overlap_threshold, pg_similarity.overlap_is_normalized
Q-Gram Distance qgram(text, text) returns float8 ~~~ yes pg_similarity.qgram_threshold, pg_similarity.qgram_is_normalized
Smith-Waterman Coefficient smithwaterman(text, text) returns float8 ~=~ no pg_similarity.sw_threshold, pg_similarity.sw_is_normalized
Smith-Waterman-Gotoh Coefficient smithwatermangotoh(text, text) returns float8 ~!~ no pg_similarity.swg_threshold, pg_similarity.swg_is_normalized
Soundex Distance soundex(text, text) returns float8 ~*~ no

Parameters

The several parameters control the behavior of the pg_similarity functions and operators. They can be classified in three classes:

  • tokenizer: controls how the strings are tokenized. Valid values are alnum, gram, word, and camelcase. All tokens are lowercase. Default is alnum.
    • alnum: delimiters are any non-alphanumeric characters.
    • gram: an n-gram is a subsequence of length n, extracted using sliding-by-one technique.
    • word: delimiters are white space characters.
    • camelcase: delimiters are capitalized characters but they are also included as first token characters.
  • threshold: controls how flexible the result set will be. Values range from 0.0 to 1.0. Default is 0.7.
  • normalized: controls whether the similarity coefficient/distance is normalized (between 0.0 and 1.0) or not. Default is true.

Examples

Set parameters at run time:

SHOW pg_similarity.levenshtein_threshold;
-- 0.7

SET pg_similarity.levenshtein_threshold TO 0.5;

SET pg_similarity.cosine_tokenizer TO camelcase;

SET pg_similarity.euclidean_is_normalized TO false;

Simple tables for examples:

CREATE TABLE foo (a text);
INSERT INTO foo VALUES('Euler'),('Oiler'),('Euler Taveira de Oliveira'),('Maria Taveira dos Santos'),('Carlos Santos Silva');

CREATE TABLE bar (b text);
INSERT INTO bar VALUES('Euler T. de Oliveira'),('Euller'),('Oliveira, Euler Taveira'),('Sr. Oliveira');

Using similarity functions

SELECT a, b, cosine(a,b), jaro(a, b), euclidean(a, b) FROM foo, bar;

Using the levenshtein operator (~==)

SHOW pg_similarity.levenshtein_threshold;
-- 0.7

SELECT a, b, lev(a,b) FROM foo, bar WHERE a ~== b;
--              a             |          b           |   lev
-- ---------------------------+----------------------+----------
--  Euler                     | Euller               | 0.833333
--  Euler Taveira de Oliveira | Euler T. de Oliveira |     0.76

SET pg_similarity.levenshtein_threshold TO 0.5;

SELECT a, b, lev(a,b) FROM foo, bar WHERE a ~== b;
--              a             |          b           |   lev
-- ---------------------------+----------------------+----------
--  Euler                     | Euller               | 0.833333
--  Oiler                     | Euller               |      0.5
--  Euler Taveira de Oliveira | Euler T. de Oliveira |     0.76

Using the qgram operator (~~~)

SET pg_similarity.qgram_threshold TO 0.7;

SELECT a, b, qgram(a, b) FROM foo, bar WHERE a ~~~ b;
--              a             |            b            |  qgram
-- ---------------------------+-------------------------+----------
--  Euler                     | Euller                  |      0.8
--  Euler Taveira de Oliveira | Euler T. de Oliveira    |  0.77551
--  Euler Taveira de Oliveira | Oliveira, Euler Taveira | 0.807692

Comparing different operators

SELECT * FROM bar WHERE b ~@@ 'euler'; -- jaro-winkler operator
SELECT * FROM bar WHERE b ~~~ 'euler'; -- qgram operator
SELECT * FROM bar WHERE b ~== 'euler'; -- levenshtein operator
SELECT * FROM bar WHERE b ~## 'euler'; -- cosine operator
Last updated on