前言
拓数派旗下云原生向量数据库 PieCloudVector 采用了将业界成熟开源算法实现与自研的基于 Postgres 内核的关系型数据库对接起来的技术路线,能够存储和管理原始数据的向量表示,同时支持精确查询与模糊查询功能,用户可以通过 Postgres 客户端进行高效的相似性搜索。
PieCloudVector 利用其分布式架构显著提升了向量计算的效率,并在此基础上提供了一整套上下游工具。在技术架构上,PieCloudVector 按照向量数据的实际应用流程划分为五个核心层级:原始数据存储、嵌入(Embedding)、索引构建、向量检索以及数据应用。这些层级分别对应于向量数据处理和分析过程中的不同应用场景,构成了一个完整的技术框架,如下图所示:
01. 数据集准备
from datasets import load_dataset
# Load only the first 1000 samples of the Fashion-MNIST training split.
dataset = load_dataset("fashion_mnist", split="train[:1000]")
# Print the column schema of the loaded dataset.
print(dataset.features)
{'image': Image(decode=True, id=None),
'label': ClassLabel(names=['T - shirt / top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'],
id=None)}
接下来,对数据进行向量化处理,运用合适的模型将其转换为数值数组,从而生成能够代表原始数据特征的嵌入向量。
以第一条数据为例,目的是找到前 10 个与其相似的服装作为推荐商品,其对应的图片如下图所示,类型为 Ankle boot(短靴)。
import numpy as np
from imgbeddings import imgbeddings
# Instantiate the image-embedding model wrapper once; it is reused below.
ibed = imgbeddings()
# Embed only the first image: it serves as the query for the similarity search.
embedding_0 = ibed.to_embeddings(dataset['image'][0])
In: embedding_0.shape
(1, 768)
embedding_0[0]
array([-2.11191326e-01, 2.07909331e-01, -6.71815038e-01, -1.66335583e+00,
-1.57210135e+00, -5.19429862e-01, -8.80079985e-01, 2.29999766e-01,
1.67191553e+00, -9.89815831e-01, 6.54723167e-01, -2.75861591e-01,
5.89815438e-01, 2.61584610e-01, 8.86729777e-01, 5.67858696e-01,
4.75497782e-01, 3.40062588e-01, -4.25924629e-01, 8.74885023e-01,
-3.10492903e-01, 2.72458225e-01, -3.28680307e-01, -4.51324023e-02,
-6.83538735e-01, -2.32427925e-01, 5.95779240e-01, 5.50612807e-01,
7.26937175e-01, 6.75487295e-02, -7.40724325e-01, -2.07319453e-01,
1.37214720e-01, 1.55591702e+00, 1.24170937e-01, -3.53575408e-01,
-7.43186593e-01, 9.77323204e-02, 4.97219563e-02, 1.00773001e+00,
1.24602437e+00, -1.76177248e-01, 5.85671842e-01, -4.85404104e-01,
-5.25022328e-01, -1.84076607e-01, -4.65092547e-02, 7.65870810e-01,
1.27615702e+00, 7.38422930e-01, 2.59102374e-01, 5.86230934e-01,
-1.34280175e-01, -4.21402991e-01, 1.31635904e-01, 6.08720705e-02,
3.83820683e-01, 9.36180592e-01, 4.59356755e-02, 3.50226104e-01,
-5.04337013e-01, -5.55240333e-01, -7.46359229e-02, 3.54337037e-01,
-6.38039052e-01, 8.85763526e-01, -2.85562664e-01, 9.87186372e-01,
1.74211636e-01, -4.21855748e-02, 2.72517443e-01, -3.59927297e-01,
...
# Embed every remaining image (index 1 onward); these are the rows inserted
# into the `pictures` table and searched against.
embedding = ibed.to_embeddings(dataset['image'][1:])
CREATE TABLE pictures (id bigserial PRIMARY KEY, embedding vector(768));
import psycopg2

# Persist the candidate embeddings into the `pictures` table, one row per
# image. Rows are inserted in dataset order so the generated ids follow the
# order of `embedding`.
conn = psycopg2.connect('postgresql://usr:pswd@192.138.***.***:5432/db')
embedding_lst = embedding.tolist()
try:
    # `with conn` commits on success and rolls back on error, but does not
    # close the connection; `with conn.cursor()` closes the cursor.
    with conn, conn.cursor() as cur:
        for row in embedding_lst:
            cur.execute('INSERT INTO pictures (embedding) values (%s)', (row,))
finally:
    # Always release the connection, even if an insert failed.
    conn.close()
from sqlalchemy import create_engine, text
import pandas as pd

engine = create_engine('postgresql://usr:pswd@192.138.***.***:5432/db', echo=False)

# Fetch the ids of the 10 stored embeddings nearest to the query embedding
# (`<->` is the distance operator used for ordering). The vector is passed as
# a bound parameter instead of being concatenated into the SQL text.
# NOTE(review): the query image (dataset index 0) is never inserted into
# `pictures`, so `id != 1` appears to exclude a legitimate candidate rather
# than the query itself - confirm whether this filter is intended.
nearest_query = text(
    'select id from pictures where id != 1 '
    'order by embedding <-> CAST(:query_vec AS vector) limit 10'
)
img_id = pd.read_sql(
    nearest_query,
    con=engine,
    params={'query_vec': str(embedding_0.tolist()[0])},
)
id_lst = img_id['id'].to_list()

# Show the five closest matches. `display` is provided by the notebook
# environment (IPython); the table id is used directly as the dataset index.
for i in id_lst[:5]:
    display(dataset['image'][i])
def most_common(lst):
    """Return the element that occurs most often in *lst*.

    Counting is done in a single pass. When several elements share the
    highest count, the one encountered first in *lst* is returned.

    Args:
        lst: An iterable of hashable items.

    Returns:
        The most frequent item.

    Raises:
        ValueError: If *lst* is empty.
    """
    # Local import keeps this snippet self-contained.
    from collections import Counter

    ranked = Counter(lst).most_common(1)
    if not ranked:
        raise ValueError("most_common() arg is an empty sequence")
    return ranked[0][0]
# Majority vote: take the label that appears most often among the retrieved
# neighbours as the predicted category of the query image.
label = most_common([dataset['label'][i] for i in id_lst])
# Convert the integer class id back to its class name.
print(dataset.features["label"].int2str(label))
dataset.features["label"].int2str(label)
'Ankle boot'
关于 PieCloudVector