Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

examples: use new vector index syntax #70

Merged
merged 1 commit into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions examples/image_search/example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,7 @@
"\n",
" id = Column(Integer, primary_key=True)\n",
" image_id = Column(Integer)\n",
" embedding = Column(\n",
" VectorType(CLIP_DIMENSION),\n",
" # using hnsw index with cosine distance\n",
" comment=\"hnsw(distance=cosine)\"\n",
" )\n",
" embedding = Column(VectorType(CLIP_DIMENSION))\n",
"\n",
"Base.metadata.drop_all(engine)\n",
"Base.metadata.create_all(engine)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ class Document(Base):
# DIMENSIONS is determined by the embedding model,
# for Jina AI's jina-embeddings-v2-base-en model it's 768.
VectorType(dim=768),
comment="hnsw(distance=cosine)"
)


Expand Down
5 changes: 3 additions & 2 deletions examples/orm-peewee-quickstart/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@ TIDB_PORT=4000
TIDB_USERNAME=******.root
TIDB_PASSWORD=********
TIDB_DATABASE=test
# For macOS. For other platforms, please refer to https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters#root-certificate-default-path.
TIDB_CA_PATH=/etc/ssl/cert.pem
# TiDB Serverless Cluster requires SSL connection for public network access.
# For local TiDB cluster, please set TIDB_SSL=false to disable SSL.
TIDB_SSL=true
17 changes: 11 additions & 6 deletions examples/orm-peewee-quickstart/peewee-quickstart.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
import os
import dotenv

from peewee import Model, MySQLDatabase, SQL, TextField
from tidb_vector.peewee import VectorField
from tidb_vector.peewee import VectorField, VectorAdaptor
from tidb_vector.constants import DistanceMetric
from peewee import Model, MySQLDatabase, TextField

dotenv.load_dotenv()

# Step 1: Connect to TiDB using Peewee.

# Using `pymysql` as the driver.
connect_kwargs = {
ssl_kwargs = {
'ssl_verify_cert': True,
'ssl_verify_identity': True,
}

# Using `mysqlclient` as the driver.
# connect_kwargs = {
# ssl_kwargs = {
# 'ssl_mode': 'VERIFY_IDENTITY',
# 'ssl': {
# # Root certificate default path
Expand All @@ -30,7 +31,7 @@
password=os.environ.get('TIDB_PASSWORD', ''),
host=os.environ.get('TIDB_HOST', 'localhost'),
port=int(os.environ.get('TIDB_PORT', '4000')),
**connect_kwargs,
**ssl_kwargs if os.environ.get('TIDB_SSL', 'false').lower() == 'true' else {},
)


Expand All @@ -53,12 +54,16 @@ class Meta:
table_name = 'peewee_demo_documents_with_index'

content = TextField()
embedding = VectorField(3, constraints=[SQL("COMMENT 'hnsw(distance=cosine)'")])
embedding = VectorField(3)


db.connect()
db.drop_tables([Document, DocumentWithIndex])
db.create_tables([Document, DocumentWithIndex])
VectorAdaptor(db).create_vector_index(
DocumentWithIndex.embedding,
DistanceMetric.COSINE,
)

# Step 3. Insert embeddings into the table.
Document.create(content='dog', embedding=[1, 2, 1])
Expand Down
9 changes: 8 additions & 1 deletion examples/orm-sqlalchemy-quickstart/.env.example
Original file line number Diff line number Diff line change
@@ -1 +1,8 @@
TIDB_DATABASE_URL=mysql+pymysql://<USERNAME>:<PASSWORD>@<HOST>:4000/<DATABASE>?ssl_ca=<CA>&ssl_verify_cert=true&ssl_verify_identity=true
TIDB_HOST=gateway01.****.prod.aws.tidbcloud.com
TIDB_PORT=4000
TIDB_USERNAME=******.root
TIDB_PASSWORD=********
TIDB_DATABASE=test
# TiDB Serverless Cluster requires SSL connection for public network access.
# For local TiDB cluster, please set TIDB_SSL=false to disable SSL.
TIDB_SSL=true
43 changes: 38 additions & 5 deletions examples/orm-sqlalchemy-quickstart/sqlalchemy-quickstart.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,43 @@
import os
import dotenv

from sqlalchemy import Column, Integer, create_engine, Text
from sqlalchemy import Column, Integer, create_engine, Text, URL
from sqlalchemy.orm import declarative_base, Session
from tidb_vector.sqlalchemy import VectorType
from tidb_vector.sqlalchemy import VectorType, VectorAdaptor
from tidb_vector.constants import DistanceMetric

dotenv.load_dotenv()

# Step 1: Connect to TiDB using SQLAlchemy.
tidb_connection_string = os.environ['TIDB_DATABASE_URL']
engine = create_engine(tidb_connection_string)

# Using `pymysql` as the driver.
drivername = 'mysql+pymysql'
ssl_kwargs = {
'ssl_verify_cert': 'true',
'ssl_verify_identity': 'true',
}

# Using `mysqlclient` as the driver.
# drivername = 'mysql+mysqldb'
# ssl_kwargs = {
# 'ssl_mode': 'VERIFY_IDENTITY',
# 'ssl': {
# # Root certificate default path
# # https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters/#root-certificate-default-path
# 'ca': os.environ.get('TIDB_CA_PATH', '/path/to/ca.pem'),
# },
# }

engine = create_engine(URL.create(
drivername=drivername,
username=os.environ['TIDB_USERNAME'],
password=os.environ['TIDB_PASSWORD'],
host=os.environ['TIDB_HOST'],
port=os.environ['TIDB_PORT'],
database=os.environ['TIDB_DATABASE'],
query=ssl_kwargs if os.environ.get('TIDB_SSL', 'false').lower() == 'true' else {},
))


# Step 2: Define a table with a vector column.
Base = declarative_base()
Expand All @@ -27,11 +55,16 @@ class DocumentWithIndex(Base):
__tablename__ = 'sqlalchemy_demo_documents_with_index'
id = Column(Integer, primary_key=True)
content = Column(Text)
embedding = Column(VectorType(3), comment="hnsw(distance=cosine)")
embedding = Column(VectorType(3))


Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)
VectorAdaptor(engine).create_vector_index(
DocumentWithIndex.embedding,
DistanceMetric.COSINE,
skip_existing=True,
)


# Step 3: Insert embeddings into the table.
Expand Down
1 change: 0 additions & 1 deletion examples/semantic-cache/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ class Cache(SQLModel, table=True):
sa_column=Column(
VectorType(768),
default=None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this also use create_vector_index instead?

Copy link
Collaborator Author

@wd0517 wd0517 Nov 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The vector index syntax is strange; I suggest only showing it in the ORM examples. The core team will introduce a new vector index creation syntax later.

Copy link
Member

@breezewish breezewish Nov 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess there is no high-priority plan to introduce a new syntax in upcoming versions, because ORM users already have a well-encapsulated one. Even after a new syntax is introduced, I don't think we could avoid using the Adaptor in all ORMs, because of syntax prefixes like ADD VECTOR INDEX. For this reason, I guess the new syntax is mainly meant to give ORM adaptors a simpler implementation (while taking care of TiFlash multi-replicas). How users use the ORM might not change, or at least changing it is not the goal.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR introduced a new VectorIndex, which can replace the current VectorAdaptor.

comment="hnsw(distance=l2)",
nullable=False,
)
)
Expand Down
Loading