From b0d1f673fbec7a9346f4adb3bc4d81d9ada4a73f Mon Sep 17 00:00:00 2001 From: LiliamLeme Date: Wed, 17 Jan 2024 15:50:31 +0000 Subject: [PATCH 01/10] Adding Azure Cosmos Db Nosql code examples --- code_samples/azure_cosmosdb_nosql/README.md | 57 ++ .../azure_cosmosdb_nosql_conda.yml | 75 +++ .../cosmos_ingestion.ipynb | 613 ++++++++++++++++++ .../cosmosdb_vector_query.ipynb | 462 +++++++++++++ 4 files changed, 1207 insertions(+) create mode 100644 code_samples/azure_cosmosdb_nosql/README.md create mode 100644 code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml create mode 100644 code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb create mode 100644 code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb diff --git a/code_samples/azure_cosmosdb_nosql/README.md b/code_samples/azure_cosmosdb_nosql/README.md new file mode 100644 index 0000000..153c7ef --- /dev/null +++ b/code_samples/azure_cosmosdb_nosql/README.md @@ -0,0 +1,57 @@ +# Azure CosmosDb Samples + +This folder includes the notebooks to demonstrate vector search capabilities of [Azure CosmosDb NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/) for text and documents. + +## Run the Code Locally + +Follow the steps to run the code locally. + +1. The samples use Conda to manage virtual environments. Create a conda environment using the [azure_cosmosdb_nosql_conda.yml](./azure_cosmosdb_nosql_conda.yml) file to include all necessary Python dependencies. + + `conda env create -f azure_cosmosdb_nosql_conda.yml` + +2. Create a *.env* file from the *.env-template* and populate it with all necessary keys. + +3. Finally, follow the instructions mentioned here to run the code locally using VS Code - [Run the Code Locally](../README.md#run-the-code-locally) + +## Resources Deployment + +- Azure CosmosDb + + *Create resource* + + Augment the Azure Cosmos DB data with semantic and vector search capabilities of Azure AI Search. 
+ + For IAC deployment, **[infrastructure](./infrastructure/)** folder has a bicep script to deploy the Azure CosmosDb. In the bicep script, fill out the parameters values in `params` section according to your environment, and run the following command. + + `az deployment group create --resource-group resource_group_name --template-file azure_cosmosdb_nosql.bicep` + + + +- Azure OpenAI + + Azure OpenAI Service resource can be deployed using [Azure Portal](https://learn.microsoft.com/azure/ai-services/openai/how-to/create-resource?pivots=web-portal), [Azure CLI](https://learn.microsoft.com/azure/ai-services/openai/how-to/create-resource?pivots=cli) or [Azure PowerShell](https://learn.microsoft.com/azure/ai-services/openai/how-to/create-resource?pivots=ps). Again, [private endpoints](https://learn.microsoft.com/azure/ai-services/cognitive-services-virtual-networks?context=%2Fazure%2Fai-services%2Fopenai%2Fcontext%2Fcontext&tabs=portal#use-private-endpoints) can be used for Azure AI services resources to allow clients on a virtual network to securely access data over Azure Private Link. 
+ +## Datasets + +- [text](../data/text/) - for text search sample + +- [docs](../data/docs/) - for document search sample + + + +## Sample Notebooks + +- [cosmos_ingestion.ipynb](./cosmos_ingestion.ipynb) +- [cosmosdb_vector_query.ipynb](./cosmosdb_vector_query.ipynb) + +## Reference + +- [Vector database - Azure Cosmos DB | Microsoft Learn](https://learn.microsoft.com/en-us/azure/cosmos-db/vector-database#implement-vector-database-functionalities-using-our-nosql-api-and-ai-search) +- [Azure AI Search Documentation](https://learn.microsoft.com/azure/search/) + - [Retrieval Augmented Generation (RAG) in Azure AI Search](https://learn.microsoft.com/azure/search/retrieval-augmented-generation-overview) + - [Vector search overview](https://learn.microsoft.com/azure/search/vector-search-overview) + - [Hybrid search overview](https://learn.microsoft.com/azure/search/hybrid-search-overview) + - [Create a vector index](https://learn.microsoft.com/azure/search/vector-search-how-to-create-index) + - [Query a vector index](https://learn.microsoft.com/azure/search/vector-search-how-to-query) + - [Vector search algorithms](https://learn.microsoft.com/azure/search/vector-search-ranking) \ No newline at end of file diff --git a/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml b/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml new file mode 100644 index 0000000..e62a21b --- /dev/null +++ b/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml @@ -0,0 +1,75 @@ +name: Cosmos_Nosql +channels: + - conda-forge + - defaults +dependencies: + - blas=1.0=mkl + - brotli-python=1.0.9=py312hd77b12b_7 + - bzip2=1.0.8=he774522_0 + - ca-certificates=2023.12.12=haa95532_0 + - certifi=2023.11.17=py312haa95532_0 + - cffi=1.16.0=py312h2bbff1b_0 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - click=8.1.7=py312haa95532_0 + - colorama=0.4.6=py312haa95532_0 + - cryptography=41.0.7=py312h89fc84f_0 + - expat=2.5.0=hd77b12b_0 + - idna=3.4=py312haa95532_0 + - 
intel-openmp=2023.1.0=h59b6b97_46320 + - libffi=3.4.4=hd77b12b_0 + - mkl=2023.1.0=h6b88ed4_46358 + - mkl-service=2.4.0=py312h2bbff1b_1 + - mkl_fft=1.3.8=py312h2bbff1b_0 + - mkl_random=1.2.4=py312h59b6b97_0 + - numpy=1.26.3=py312hfd52020_0 + - numpy-base=1.26.3=py312h4dde369_0 + - openssl=3.0.12=h2bbff1b_0 + - pip=23.3.1=py312haa95532_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pyopenssl=23.2.0=py312haa95532_0 + - pysocks=1.7.1=py312haa95532_0 + - python=3.12.0=h1d929f7_0 + - python-dotenv=0.21.0=py312haa95532_0 + - requests=2.31.0=py312haa95532_0 + - setuptools=68.2.2=py312haa95532_0 + - sqlite=3.41.2=h2bbff1b_0 + - tbb=2021.8.0=h59b6b97_0 + - tk=8.6.12=h2bbff1b_0 + - tzdata=2023d=h04d1e81_0 + - urllib3=1.26.18=py312haa95532_0 + - vc=14.2=h21ff451_1 + - vs2015_runtime=14.27.29016=h5e58377_2 + - wheel=0.41.2=py312haa95532_0 + - win_inet_pton=1.1.0=py312haa95532_0 + - xz=5.4.5=h8cc25b3_0 + - zlib=1.2.13=h8cc25b3_0 + - pip: + - annotated-types==0.6.0 + - anyio==4.2.0 + - certifi==2023.11.17 + - charset-normalizer==3.3.2 + - click==8.1.7 + - contourpy==1.2.0 + - cycler==0.12.1 + - distro==1.9.0 + - filelock==3.13.1 + - fonttools==4.47.0 + - fsspec==2023.12.2 + - h11==0.14.0 + - httpcore==1.0.2 + - httpx==0.26.0 + - idna==3.6 + - jinja2==3.1.2 + - joblib==1.3.2 + - markupsafe==2.1.3 + - matplotlib==3.8.2 + - networkx==3.2.1 + - numpy==1.26.2 + - openai==1.6.1 + - pandas==2.1.4 + - python-dotenv==1.0.0 + - regex==2023.12.25 + - requests==2.31.0 + - safetensors==0.4.1 + - scikit-learn==1.3.2 + - scipy==1.11.4 diff --git a/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb new file mode 100644 index 0000000..3856ed8 --- /dev/null +++ b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb @@ -0,0 +1,613 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Ingestion to COSMOSDB \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 
Install libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install numpy\n", + "! pip install openai\n", + "! pip install python-dotenv\n", + "! pip install azure-core \n", + "! pip install azure-cosmos\n", + "! pip install tenacity\n", + "! pip install azure-search-documents\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import datetime\n", + "import time\n", + "\n", + "from azure.core.exceptions import AzureError\n", + "from azure.core.credentials import AzureKeyCredential\n", + "from azure.cosmos import exceptions, CosmosClient, PartitionKey\n", + "from azure.search.documents import SearchClient\n", + "from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient\n", + "from azure.search.documents.models import Vector\n", + "from azure.search.documents.indexes.models import (\n", + " IndexingSchedule,\n", + " SearchIndex,\n", + " SearchIndexer,\n", + " SearchIndexerDataContainer,\n", + " SearchField,\n", + " SearchFieldDataType,\n", + " SearchableField,\n", + " SemanticConfiguration,\n", + " SimpleField,\n", + " PrioritizedFields,\n", + " SemanticField,\n", + " SemanticSettings,\n", + " VectorSearch,\n", + " VectorSearchAlgorithmConfiguration,\n", + " SearchIndexerDataSourceConnection\n", + ")\n", + "\n", + "from azure.core.credentials import AzureKeyCredential\n", + "from azure.core.exceptions import HttpResponseError\n", + "\n", + "import openai\n", + "from tenacity import retry, wait_random_exponential, stop_after_attempt\n", + "\n", + "import pandas as pd\n", + "from azure.cosmos import CosmosClient, partition_key, exceptions\n", + "\n", + "import os\n", + "from dotenv import load_dotenv\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Environment Variables" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "load_dotenv()\n", + "\n", + "cosmos_db_api_endpoint = os.getenv(\"cosmos_db_api_endpoint\")\n", + "if cosmos_db_api_endpoint is None or cosmos_db_api_endpoint == \"\":\n", + " print(\"cosmos_db_api_endpoint environment variable not set.\")\n", + " exit()\n", + "\n", + "cosmos_db_api_key = os.getenv(\"cosmos_db_api_key\")\n", + "if cosmos_db_api_key is None or cosmos_db_api_key == \"\":\n", + " print(\"cosmos_db_api_key environment variable not set.\")\n", + " exit()\n", + "\n", + "cog_search_endpoint = os.getenv(\"cog_search_endpoint\")\n", + "if cog_search_endpoint is None or cog_search_endpoint == \"\":\n", + " print(\"cog_search_endpoint environment variable not set.\")\n", + " exit()\n", + "\n", + "cog_search_key = os.getenv(\"cog_search_key\")\n", + "if cog_search_key is None or cog_search_key == \"\":\n", + " print(\"cog_search_key environment variable not set.\")\n", + " exit()\n", + "\n", + "cosmos_db_connection_string = os.getenv(\"cosmos_db_connection_string\")\n", + "if cosmos_db_connection_string is None or cosmos_db_connection_string == \"\":\n", + " print(\"cog_search_key environment variable not set.\")\n", + " exit()\n", + " \n", + "\n", + "text_table_name = 'text_sample'\n", + "doc_table_name = 'doc_sample'\n", + "image_table_name = 'image_sample'\n", + "\n", + "database_name = \"Vector_DB\"\n", + "credential = AzureKeyCredential(str(cog_search_key))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cosmos ( Nosql)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Opening the connections.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.cosmos import CosmosClient\n", + "\n", + "# Your Cosmos DB connection parameters\n", + "database_name = \"Vector_DB\" #####Replace here the name you want 
to use for your Database####\n", + "\n", + "# Initialize the Cosmos DB client\n", + "client = CosmosClient(cosmos_db_api_endpoint, credential=cosmos_db_api_key)\n", + "\n", + "# Create or get a reference to the database\n", + "database = client.create_database_if_not_exists(id=database_name)\n", + "\n", + "print(f\"Database {database_name} created or retrieved.\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Initialize the Cosmos DB client - Creating containers\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Function for new Container" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "## this adaptation, the script uses the Cosmos DB Python SDK to create items in the Cosmos DB container. \n", + "# The to_dict(orient='records') method is used to convert the Pandas DataFrame to a list of dictionaries, where each dictionary represents a record\n", + "# Function to insert data into Cosmos DB\n", + "def new_container(container):\n", + " try:\n", + " partition_key_ = PartitionKey(path=\"/id\")\n", + " container = database.create_container_if_not_exists(\n", + " id=container,\n", + " partition_key=partition_key_ )\n", + "\n", + " print(f\"Document {container} created successfully\")\n", + "\n", + " except exceptions.CosmosResourceExistsError as e:\n", + " print(\"Container already exists.\")\n", + "\n", + " except Exception as e:\n", + " # Handle other exceptions\n", + " print(f\"Error: {e}\")\n", + "\n", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Text ingestion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from azure.cosmos import CosmosClient\n", + "\n", + "\n", + "cosmosdb_container_name = text_table_name\n", + "container = 
database.get_container_client(cosmosdb_container_name)\n", + "\n", + "# Read data from the JSON file\n", + "text_df = pd.read_json('../data/text/product_docs_embeddings.json')\n", + "records = text_df.to_dict(orient='records')\n", + "\n", + "\n", + "# Create cntainer\n", + "new_container(cosmosdb_container_name)\n", + "\n", + "\n", + "# Iterate through the data and insert the files with the embeddings into the container\n", + "try:\n", + " for item in records:\n", + " title = item['title']\n", + " content = item['content']\n", + " item['titleVector'] = item['title_vector']\n", + " item['contentVector'] = item['content_vector']\n", + " item['@search.action'] = 'upload'\n", + "\n", + " # Convert the 'id' attribute to a string\n", + " item['id'] = str(item['id'])\n", + "\n", + " # Insert the item into the container\n", + " container.create_item(body=item)\n", + "\n", + " print(f\"Data items inserted into the Cosmos DB {cosmosdb_container_name}\")\n", + "\n", + "except exceptions.CosmosResourceExistsError as e:\n", + " # Handle conflict error\n", + " print(f\"Document {container} with ID {item['id']} already exists...\")\n", + " print(f\"Error: {e}\")\n", + "\n", + " # Implement your logic to update the existing document or take appropriate action\n", + "\n", + "except Exception as e:\n", + " # Handle other exceptions\n", + " print(f\"Error: {e}\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Doc ingestion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cosmosdb_container_name = doc_table_name\n", + "container = database.get_container_client(cosmosdb_container_name)\n", + "\n", + "# Read data from the JSON file\n", + "doc_df = pd.read_json('../data/docs/employee_handbook_embeddings.json')\n", + "records = doc_df.to_dict(orient='records')\n", + "\n", + "# Create cntainer\n", + "new_container(cosmosdb_container_name)\n", + "\n", + "# Iterate through the data 
and insert the files with the embeddings into the container\n", + "try:\n", + " for item in records:\n", + " chunk_content = item['chunk_content']\n", + " item['chunk_content_vector'] = item['chunk_content_vector']\n", + " item['@search.action'] = 'upload'\n", + "\n", + " # Convert the 'id' attribute to a string\n", + " item['id'] = str(item['id'])\n", + "\n", + " # Insert the item into the container\n", + " container.create_item(body=item)\n", + "\n", + " print(f\"Data items inserted into the Cosmos DB {cosmosdb_container_name}\")\n", + "\n", + "except exceptions.CosmosResourceExistsError as e:\n", + " # Handle conflict error\n", + " print(f\"Document {container} with ID {item['id']} already exists...\")\n", + " print(f\"Error: {e}\")\n", + "\n", + " # Implement your logic to update the existing document or take appropriate action\n", + "\n", + "except Exception as e:\n", + " # Handle other exceptions\n", + " print(f\"Error: {e}\")\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Checking the data inserted (optional)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Specify the container name\n", + "Table_name = \"text_sample\"\n", + "container = database.get_container_client(Table_name)\n", + "\n", + "#Number of rows - Top 10 for example\n", + "top_x_rows = 10\n", + "\n", + "print(f\"Quality test top ( {top_x_rows} )\")\n", + "\n", + "query = f\"SELECT TOP {top_x_rows} * FROM c\"\n", + "\n", + "# Execute the query\n", + "query_result = container.query_items(query, enable_cross_partition_query=True)\n", + "\n", + "# Process the query result\n", + "for item in query_result:\n", + " print(item)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create HNSW Index\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"\n", + "# https://learn.microsoft.com/en-gb/azure/search/search-create-service-portal\n", + "\n", + "\n", + "# Semantic search configuration\n", + "config_text = SemanticConfiguration(\n", + " name=\"ConfigSemantictext\",\n", + " prioritized_fields=PrioritizedFields(\n", + " title_field=SemanticField(field_name=\"title\"),\n", + " prioritized_keywords_fields=[SemanticField(field_name=\"category\")],\n", + " prioritized_content_fields=[SemanticField(field_name=\"content\")]\n", + " )\n", + ")\n", + "\n", + "# Semantic search configuration\n", + "config_doc = SemanticConfiguration(\n", + " name=\"ConfigSemanticdoc\",\n", + " prioritized_fields=PrioritizedFields(\n", + " title_field=SemanticField(field_name=\"chunk_content\")\n", + " )\n", + ")\n", + "\n", + "\n", + "\n", + "# Create the configurration\n", + "settings_text = SemanticSettings(configurations=[config_text])\n", + "settings_doc = SemanticSettings(configurations=[config_doc])\n", + "\n", + "\n", + "\n", + "# Vector search configuration\n", + "vector_search_config = VectorSearchAlgorithmConfiguration(\n", + " name=\"vector-cosmos-config\",\n", + " kind=\"hnsw\",\n", + " hnsw_parameters={\n", + " \"m\": 4,\n", + " \"efConstruction\": 400,\n", + " \"efSearch\": 1000,\n", + " \"metric\": \"cosine\"\n", + " }\n", + ")\n", + "\n", + "try:\n", + " index_client = SearchIndexClient(endpoint=cog_search_endpoint, credential=cog_search_cred)\n", + "\n", + "\n", + " # Define index fields for text_sample\n", + " text_index_name = \"text_sample_index\"\n", + " text_fields = [\n", + " SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n", + " SearchableField(name=\"title\", type=SearchFieldDataType.String,\n", + " searchable=True, retrievable=True),\n", + " SearchableField(name=\"content\", type=SearchFieldDataType.String,\n", + " searchable=True, retrievable=True),\n", + " SearchableField(name=\"category\", type=SearchFieldDataType.String,\n", + " filterable=True, searchable=True, 
retrievable=True),\n", + " SearchField(name=\"title_vector\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", + " searchable=True, dimensions=1536, vector_search_configuration=\"vector-cosmos-config\"),\n", + " SearchField(name=\"content_vector\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", + " searchable=True, dimensions=1536, vector_search_configuration=\"vector-cosmos-config\"),\n", + " ]\n", + "\n", + "\n", + "\n", + " # Define index fields for doc_sample\n", + " doc_index_name = \"doc_sample_index\"\n", + " doc_fields = [\n", + " SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n", + " SearchableField(name=\"chunk_content\", type=SearchFieldDataType.String,\n", + " searchable=True, retrievable=True),\n", + " SearchField(name=\"chunk_content_vector\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", + " searchable=True, dimensions=1536, vector_search_configuration=\"vector-cosmos-config\"), \n", + " ]\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " # Create indexes\n", + " text_index = SearchIndex(name=text_index_name, fields=text_fields, vector_search=VectorSearch(algorithm_configurations=[vector_search_config]), semantic_settings=settings_text)\n", + " doc_index = SearchIndex(name=doc_index_name, fields=doc_fields, vector_search=VectorSearch(algorithm_configurations=[vector_search_config]), semantic_settings=settings_doc)\n", + "\n", + "\n", + "\n", + " # Create or update indexes\n", + " index_client.create_or_update_index(text_index)\n", + " print(f'Indexes created or updated: {text_index_name}')\n", + " \n", + " index_client.create_or_update_index(doc_index)\n", + " print(f'Indexes created or updated: {doc_index_name}')\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "except HttpResponseError as e:\n", + " print(f\"HTTP Error: {e}\")\n", + " print(f\"Status Code: {e.status_code}\")\n", + " print(f\"Error Message: {e.error.message}\")\n", + "\n", + "\n", 
+ "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the Azure Cognitive Search index\n", + "##### Datasource Function\n", + "##### Indexer Function\n" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "def create_datasource(data_source_name, cosmos_db_connection_string, table_name, indexer_client):\n", + " try:\n", + " container_cosmos = SearchIndexerDataContainer(\n", + " name=table_name,\n", + " query=f\"SELECT * FROM {table_name} c WHERE c._ts>@HighWaterMark ORDER BY c._ts\"\n", + " )\n", + "\n", + " # Define the data source connection\n", + " data_source_connection = SearchIndexerDataSourceConnection(\n", + " name=data_source_name,\n", + " type=\"cosmosdb\",\n", + " connection_string=cosmos_db_connection_string,\n", + " container=container_cosmos\n", + " )\n", + "\n", + " # Create or update the data source connection\n", + " data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)\n", + "\n", + " except HttpResponseError as ex:\n", + " print(f\"Error: {ex}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "def create_indexer_if_not_exists(\n", + " indexer_name, target_index_name, data_source_name, indexer_client\n", + "\n", + "):\n", + "\n", + " try:\n", + " \n", + " # Create and run the indexer\n", + " indexer = SearchIndexer(\n", + " name=indexer_name,\n", + " data_source_name=data_source_name,\n", + " target_index_name=target_index_name\n", + " )\n", + "\n", + " indexer_client.create_or_update_indexer(indexer)\n", + " indexer_client.run_indexer(indexer_name)\n", + "\n", + " except HttpResponseError as ex:\n", + " print(f\"Error: {ex}\")\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating the datasource and index from the functions already defined\n", + "##### Indexers to crawl 
data from various data sources and insert them into indexes\n", + "##### Data Sources that connect Azure Cognitive Search to Cosmos" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "##You must have a Cognitve search service already created\n", + "indexer_client = SearchIndexerClient(cog_search_endpoint, credential)\n", + "\n", + "\n", + "data_source_name = 'textsample'\n", + "table_name =text_table_name\n", + "indexer_name = data_source_name\n", + "target_index_name_=text_index_name\n", + "\n", + "create_datasource(data_source_name,cosmos_db_connection_string,table_name, indexer_client)\n", + "create_indexer_if_not_exists(indexer_name, target_index_name_, data_source_name, indexer_client)\n", + "\n", + "\n", + "data_source_name = 'docsample'\n", + "table_name =doc_table_name\n", + "indexer_name = data_source_name\n", + "target_index_name_=doc_index_name\n", + "\n", + "create_datasource(data_source_name,cosmos_db_connection_string,table_name, indexer_client)\n", + "create_indexer_if_not_exists(indexer_name, target_index_name_, data_source_name,indexer_client)\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "postgresql", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb b/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb new file mode 100644 index 0000000..58077a5 --- /dev/null +++ b/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb @@ -0,0 +1,462 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Vector Search " + ] + }, + { 
+ "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import datetime\n", + "import time\n", + "\n", + "from azure.core.exceptions import AzureError\n", + "from azure.core.credentials import AzureKeyCredential\n", + "from azure.cosmos import exceptions, CosmosClient, PartitionKey\n", + "from azure.search.documents import SearchClient\n", + "from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient\n", + "from azure.search.documents.models import (\n", + " QueryAnswerType,\n", + " QueryCaptionType,\n", + " QueryLanguage,\n", + " QueryType,\n", + " Vector \n", + ")\n", + "from azure.search.documents.indexes.models import (\n", + " IndexingSchedule,\n", + " SearchIndex,\n", + " SearchIndexer,\n", + " SearchIndexerDataContainer,\n", + " SearchField,\n", + " SearchFieldDataType,\n", + " SearchableField,\n", + " SemanticConfiguration,\n", + " SimpleField,\n", + " PrioritizedFields,\n", + " SemanticField,\n", + " SemanticSettings,\n", + " VectorSearch,\n", + " VectorSearchAlgorithmConfiguration,\n", + " SearchIndexerDataSourceConnection,\n", + " \n", + ")\n", + "\n", + "import openai\n", + "from tenacity import retry, wait_random_exponential, stop_after_attempt\n", + "\n", + "\n", + "import os\n", + "from dotenv import load_dotenv\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Environment variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "load_dotenv()\n", + "\n", + "cosmos_db_api_endpoint = os.getenv(\"cosmos_db_api_endpoint\")\n", + "if cosmos_db_api_endpoint is None or cosmos_db_api_endpoint == \"\":\n", + " print(\"cosmos_db_api_endpoint environment variable not set.\")\n", + " exit()\n", + "\n", + "cosmos_db_api_key = os.getenv(\"cosmos_db_api_key\")\n", + 
"if cosmos_db_api_key is None or cosmos_db_api_key == \"\":\n", + " print(\"cosmos_db_api_key environment variable not set.\")\n", + " exit()\n", + "\n", + "cog_search_endpoint = os.getenv(\"cog_search_endpoint\")\n", + "if cog_search_endpoint is None or cog_search_endpoint == \"\":\n", + " print(\"cog_search_endpoint environment variable not set.\")\n", + " exit()\n", + "\n", + "cog_search_key = os.getenv(\"cog_search_key\")\n", + "if cog_search_key is None or cog_search_key == \"\":\n", + " print(\"cog_search_key environment variable not set.\")\n", + " exit()\n", + "\n", + "cosmos_db_connection_string = os.getenv(\"cosmos_db_connection_string\")\n", + "if cosmos_db_connection_string is None or cosmos_db_connection_string == \"\":\n", + " print(\"cosmos_db_connection_string environment variable not set.\")\n", + " exit()\n", + " \n", + "aoai_embedding_deployed_model = os.getenv(\"aoai_embedding_deployed_model\")\n", + "if aoai_embedding_deployed_model is None or aoai_embedding_deployed_model == \"\":\n", + " print(\"aoai_embedding_deployed_model environment variable not set.\")\n", + " exit()\n", + " \n", + "\n", + "text_table_name = 'text_sample'\n", + "doc_table_name = 'doc_sample'\n", + "image_table_name = 'image_sample'\n", + "\n", + "database_name = \"Vector_DB\"\n", + "credential = AzureKeyCredential(str(cog_search_key))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Initialize the search" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize Cosmos DB client\n", + "cosmos_client = CosmosClient(cosmos_db_api_endpoint, cosmos_db_api_key)\n", + "database = cosmos_client.get_database_client(database_name)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Simple Vector Search\n", + "##### You will use the index name that you created previously on the ingestion steps for the respectively containers\n" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Using Simple Search with Cosine similarity" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from openai.embeddings_utils import get_embedding, cosine_similarity\n", + "# Query Cosmos DB using Azure Cognitive Search\n", + "\n", + "container_name = 'text_sample'\n", + "index_name = \"text_sample_index\"\n", + "\n", + "container = database.get_container_client(container_name)\n", + "search_client = SearchClient(cog_search_endpoint, index_name, credential) \n", + "\n", + "\n", + "query = 'tools for software development'\n", + "query_vector = get_embedding(query, k=3, engine=aoai_embedding_deployed_model)\n", + "\n", + "# Perform Azure Cognitive Search query\n", + "search_results = search_client.search(search_text=query, select=[\"title\", \"content\", \"category\", \"title_vector\", \"content_vector\"])\n", + "\n", + "\n", + "for result in search_results:\n", + " result_vector = result.get(\"content_vector\", None)\n", + "\n", + " if result_vector is not None and len(result_vector) > 0:\n", + " similarity_score = cosine_similarity(query_vector, result_vector)\n", + "\n", + " print(f\"Title: {result['title']}\")\n", + " print(f\"Score: {result['@search.score']}\")\n", + " print(f\"Content: {result['content']}\")\n", + " print(f\"Category: {result['category']}\")\n", + " print(f\"Cosine Similarity: {similarity_score}\\n\")\n", + " else:\n", + " print(f\"Skipping result with empty or missing vector.\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Using Simple Search with Vector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from openai.embeddings_utils import get_embedding, cosine_similarity\n", + "\n", + "# Query Cosmos DB using Azure Cognitive Search\n", + "\n", + "container_name = 'text_sample'\n", + "index_name = 
\"text_sample_index\"\n", + "\n", + "container = database.get_container_client(container_name)\n", + "search_client = SearchClient(cog_search_endpoint, index_name, credential) \n", + "\n", + "# Perform Azure Cognitive Search query\n", + "query = 'tools for software development'\n", + "\n", + "\n", + "search_results = search_client.search( \n", + " search_text=\"\", \n", + " vector=Vector(value=get_embedding(query, k=3, engine=aoai_embedding_deployed_model), fields = \"content_vector\"), \n", + " select=[\"title\", \"content\", \"category\", \"title_vector\", \"content_vector\"]\n", + " )\n", + "\n", + "\n", + "for result in search_results:\n", + " print(f\"Title: {result['title']}\")\n", + " print(f\"Score: {result['@search.score']}\")\n", + " print(f\"Content: {result['content']}\")\n", + " print(f\"Category: {result['category']}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Cross Search (Two columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container_name = \"text_sample\"\n", + "index_name = \"text_sample_index\"\n", + "\n", + "container = database.get_container_client(container_name)\n", + "search_client = SearchClient(cog_search_endpoint, index_name, credential) \n", + "\n", + "query = 'tools for software development'\n", + "query_vector = get_embedding(query, k=3, engine=aoai_embedding_deployed_model)\n", + "\n", + "# Perform Azure Cognitive Search query\n", + "search_results = search_client.search(\n", + " search_text=query,\n", + " select=[\"title\", \"content\", \"category\", \"title_vector\", \"content_vector\"]\n", + ")\n", + "\n", + "# Filter results in Python based on cosine similarity\n", + "for result in search_results:\n", + " title_vector = result.get(\"title_vector\", None)\n", + " content_vector = result.get(\"content_vector\", None)\n", + "\n", + " if title_vector is not None and content_vector is not None:\n", + " title_similarity 
= cosine_similarity(query_vector, title_vector)\n", + " content_similarity = cosine_similarity(query_vector, content_vector)\n", + "\n", + " # Adjust the threshold as needed\n", + " if title_similarity > 0.7 or content_similarity > 0.7:\n", + " print(f\"Title: {result['title']}\")\n", + " print(f\"Score: {result['@search.score']}\")\n", + " print(f\"Content: {result['content']}\")\n", + " print(f\"Category: {result['category']}\")\n", + " print(f\"Title Cosine Similarity: {title_similarity}\")\n", + " print(f\"Content Cosine Similarity: {content_similarity}\\n\")\n", + " else:\n", + " print(f\"Skipping result with empty or missing vector.\\n\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hybrid search + Semantic Rank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "##Enable the service: https://learn.microsoft.com/en-us/azure/search/semantic-how-to-enable-disable?tabs=enable-portal\n", + "##Use the Semantic configuration defined at the Ingestion\n", + "\n", + "container_name = \"text_sample\"\n", + "index_name = \"text_sample_index\"\n", + "semantic_configuration = 'ConfigSemantictext'\n", + "\n", + "container = database.get_container_client(container_name)\n", + "search_client = SearchClient(cog_search_endpoint, index_name, credential) \n", + "\n", + "query = 'Azure DevOps is a suite of services that help you plan'\n", + "\n", + "# Perform Azure Cognitive Search query\n", + "search_results = search_client.search(\n", + " search_text=query,\n", + " select=[\"title\", \"content\", \"category\", \"title_vector\", \"content_vector\"],\n", + " query_type=QueryType.SEMANTIC, query_language=QueryLanguage.EN_US, semantic_configuration_name=semantic_configuration, query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE,\n", + " top=5\n", + ")\n", + "\n", + "_answers = search_results.get_answers()\n", + "for answer in _answers:\n", + " 
print(f\"Semantic Answer: {answer}\")\n", + " if answer.highlights:\n", + " print(f\"Semantic Answer highlight: {answer.highlights}\")\n", + " else:\n", + " print(f\"Semantic Answer Text : {answer.text}\")\n", + " print(f\"Semantic Answer Score: {answer.score}\\n\")\n", + "\n", + "\n", + "# Filter results in Python based on cosine similarity\n", + "for row in search_results:\n", + " print(f\"Title: {row['title']}\")\n", + " print(f\"Score: {row['@search.score']}\")\n", + " print(f\"Content: {row['content']}\")\n", + " print(f\"Category: {row['category']}\")\n", + "\n", + " \n", + " captions = row[\"@search.captions\"]\n", + " if captions:\n", + " caption = captions[0]\n", + " if caption.highlights:\n", + " print(f\"Caption: {caption.highlights}\\n\")\n", + " else:\n", + " print(f\"Caption: {caption.text}\\n\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Document search example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container_name = 'doc_sample'\n", + "index_name = \"doc_sample_index\"\n", + "\n", + "container = database.get_container_client(container_name)\n", + "search_client = SearchClient(cog_search_endpoint, index_name, credential) \n", + "\n", + "# Query Cosmos DB using Azure Cognitive Search\n", + "\n", + "query = 'web hosting services'\n", + "\n", + "search_results = search_client.search( \n", + " search_text=\"\", \n", + " vector=Vector(value=get_embedding(query, k=3, engine=aoai_embedding_deployed_model), fields = \"chunk_content_vector\"), \n", + " select=[\"chunk_content\", \"chunk_content_vector\"]\n", + " )\n", + "\n", + "\n", + "for result in search_results:\n", + " print(f\"chunk_content: {result['chunk_content']}\")\n", + " print(f\"Score: {result['@search.score']}\")\n", + " print(f\"chunk_content_vector: {result['chunk_content_vector']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hybrid search + 
Semantic Rank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "##Enable the service: https://learn.microsoft.com/en-us/azure/search/semantic-how-to-enable-disable?tabs=enable-portal\n", + "##Use the Semantic configuration defined at the Ingestion\n", + "\n", + "container_name = \"doc_sample\"\n", + "index_name = \"doc_sample_index\"\n", + "semantic_configuration = 'ConfigSemanticdoc'\n", + "\n", + "container = database.get_container_client(container_name)\n", + "search_client = SearchClient(cog_search_endpoint, index_name, credential) \n", + "\n", + "query = 'This policy applies to all Contoso Electronics employees'\n", + "# Perform Azure Cognitive Search query with semantic search\n", + "search_results = search_client.search(\n", + " search_text=query,\n", + " select=[\"chunk_content\", \"chunk_content_vector\"],\n", + " query_type=QueryType.SEMANTIC, query_language=QueryLanguage.EN_US, semantic_configuration_name=semantic_configuration, query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE,\n", + " top=5\n", + ")\n", + "\n", + "\n", + "_answers = search_results.get_answers()\n", + "for answer in _answers:\n", + " print(f\"Semantic Answer: {answer}\")\n", + " if answer.highlights:\n", + " print(f\"Semantic Answer highlight: {answer.highlights}\")\n", + " else:\n", + " print(f\"Semantic Answer Text : {answer.text}\")\n", + " print(f\"Semantic Answer Score: {answer.score}\\n\")\n", + "\n", + "# Filter results in Python based on cosine similarity\n", + "for row in search_results:\n", + " print(f\"chunk_content: {row['chunk_content']}\")\n", + " print(f\"Score: {row['@search.score']}\")\n", + " print(f\"chunk_content_vector: {row['chunk_content_vector']}\")\n", + " \n", + " captions = row[\"@search.captions\"]\n", + " if captions:\n", + " caption = captions[0]\n", + " if caption.highlights:\n", + " print(f\"Caption: {caption.highlights}\\n\")\n", + " else:\n", + " 
print(f\"Caption: {caption.text}\\n\")\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "postgresql", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 3070e62693f39ad7a81b9e3413b56bf9b838a3f1 Mon Sep 17 00:00:00 2001 From: LiliamLeme Date: Thu, 18 Jan 2024 09:58:08 +0000 Subject: [PATCH 02/10] Adjusting a few details --- code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb | 3 --- .../azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb | 5 ++--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb index 3856ed8..aac9d9d 100644 --- a/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb +++ b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb @@ -128,7 +128,6 @@ "\n", "text_table_name = 'text_sample'\n", "doc_table_name = 'doc_sample'\n", - "image_table_name = 'image_sample'\n", "\n", "database_name = \"Vector_DB\"\n", "credential = AzureKeyCredential(str(cog_search_key))\n" @@ -226,8 +225,6 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "from azure.cosmos import CosmosClient\n", "\n", "\n", "cosmosdb_container_name = text_table_name\n", diff --git a/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb b/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb index 58077a5..b6136f1 100644 --- a/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb +++ b/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb @@ -62,7 +62,7 @@ "import os\n", "from dotenv import load_dotenv\n", "\n", - "\n" + "from openai.embeddings_utils 
import get_embedding, cosine_similarity\n" ] }, { @@ -159,7 +159,7 @@ "metadata": {}, "outputs": [], "source": [ - "from openai.embeddings_utils import get_embedding, cosine_similarity\n", + "\n", "# Query Cosmos DB using Azure Cognitive Search\n", "\n", "container_name = 'text_sample'\n", @@ -204,7 +204,6 @@ "metadata": {}, "outputs": [], "source": [ - "from openai.embeddings_utils import get_embedding, cosine_similarity\n", "\n", "# Query Cosmos DB using Azure Cognitive Search\n", "\n", From edf8cb053561fa23e1886a3b6e266de9f209901d Mon Sep 17 00:00:00 2001 From: LiliamLeme Date: Fri, 26 Jan 2024 14:07:54 +0000 Subject: [PATCH 03/10] Finished the IAC files --- .../azure_cosmosdb_nosql/.env-template | 6 +++ code_samples/azure_cosmosdb_nosql/README.md | 6 +-- .../infrastructure/azure_cosmosdb_nosql.bicep | 48 +++++++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 code_samples/azure_cosmosdb_nosql/.env-template create mode 100644 code_samples/azure_cosmosdb_nosql/infrastructure/azure_cosmosdb_nosql.bicep diff --git a/code_samples/azure_cosmosdb_nosql/.env-template b/code_samples/azure_cosmosdb_nosql/.env-template new file mode 100644 index 0000000..3d3f7fb --- /dev/null +++ b/code_samples/azure_cosmosdb_nosql/.env-template @@ -0,0 +1,6 @@ +cosmos_db_api_endpoint= +cosmos_db_api_key= +cog_search_endpoint= +cog_search_key= +cosmos_db_connection_string= +aoai_embedding_deployed_model = \ No newline at end of file diff --git a/code_samples/azure_cosmosdb_nosql/README.md b/code_samples/azure_cosmosdb_nosql/README.md index 153c7ef..d367c66 100644 --- a/code_samples/azure_cosmosdb_nosql/README.md +++ b/code_samples/azure_cosmosdb_nosql/README.md @@ -6,7 +6,7 @@ This folder includes the notebooks to demonstrate vector search capabilities of Follow the steps to run the code locally. -1. The samples uses Conda to manage virtual environments. 
Create a conda environment using the [azure_cosmosdb_nosql_conda.yml](./azure_cosmosdb_postgresql_conda.yml) file to include all necessary python dependencies. +1. The samples uses Conda to manage virtual environments. Create a conda environment using the [azure_cosmosdb_nosql_conda.yml](./azure_cosmosdb_nosql_conda.yml) file to include all necessary python dependencies. `conda env create -f azure_cosmosdb_nosql_conda.yml` @@ -42,8 +42,8 @@ Follow the steps to run the code locally. ## Sample Notebooks -- [cosmos_ingestion.ipynb](./cosmos_ingestion.ipynb) -- [cosmos_ingestion.ipynb](./cosmos_ingestion.ipynb) +- [azure_cosmos_ingestion.ipynb](./cosmos_ingestion.ipynb) +- [azure_cosmos_vector_query.ipynb](./cosmosdb_vector_query.ipynb) ## Reference diff --git a/code_samples/azure_cosmosdb_nosql/infrastructure/azure_cosmosdb_nosql.bicep b/code_samples/azure_cosmosdb_nosql/infrastructure/azure_cosmosdb_nosql.bicep new file mode 100644 index 0000000..c3ea542 --- /dev/null +++ b/code_samples/azure_cosmosdb_nosql/infrastructure/azure_cosmosdb_nosql.bicep @@ -0,0 +1,48 @@ +param cosmosDbAccountName string = 'Replace the name of your Account - no capital letters' +param location string = 'replace location for example - West US' +param cosmosDbKind string = 'GlobalDocumentDB' +param cosmosDbOfferType string = 'Standard' +param cosmosDbCapabilities array = [ + { + name: 'EnableServerless' + } +] +param backupIntervalInMinutes int = 240 +param backupRetentionIntervalInHours int = 8 + + +resource cosmosDb 'Microsoft.DocumentDB/databaseAccounts@2021-04-15' = { + name: cosmosDbAccountName + location: location + kind: cosmosDbKind + properties: { + databaseAccountOfferType: cosmosDbOfferType + capabilities: cosmosDbCapabilities + backupPolicy: { + type: 'Periodic' + periodicModeProperties: { + backupIntervalInMinutes: backupIntervalInMinutes + backupRetentionIntervalInHours: backupRetentionIntervalInHours + } + } + consistencyPolicy: { + defaultConsistencyLevel: 'Session' + 
maxIntervalInSeconds: 5 + maxStalenessPrefix: 100 + } + locations: [ + { + locationName: location + failoverPriority: 0 + } + ] + isVirtualNetworkFilterEnabled: false + enableAutomaticFailover: false + enableMultipleWriteLocations: false + disableKeyBasedMetadataWriteAccess: false + publicNetworkAccess: 'Enabled' + enableFreeTier: false + enableAnalyticalStorage: false + + } +} From 47c57144c4898171ce5914852ebccf5bf9991ae0 Mon Sep 17 00:00:00 2001 From: LiliamLeme Date: Fri, 26 Jan 2024 14:18:54 +0000 Subject: [PATCH 04/10] env parameter fle --- code_samples/azure_cosmosdb_nosql/.env-template | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code_samples/azure_cosmosdb_nosql/.env-template b/code_samples/azure_cosmosdb_nosql/.env-template index 3d3f7fb..e45f031 100644 --- a/code_samples/azure_cosmosdb_nosql/.env-template +++ b/code_samples/azure_cosmosdb_nosql/.env-template @@ -3,4 +3,5 @@ cosmos_db_api_key= cog_search_endpoint= cog_search_key= cosmos_db_connection_string= -aoai_embedding_deployed_model = \ No newline at end of file +aoai_embedding_deployed_model = +cosmos_db_connection_string = \ No newline at end of file From cc258a7b41c2c61c121b47fd3be034745ecbf47a Mon Sep 17 00:00:00 2001 From: LiliamLeme Date: Fri, 26 Jan 2024 14:22:50 +0000 Subject: [PATCH 05/10] removing some unecessary comments --- code_samples/azure_cosmosdb_nosql/.env-template | 5 ++++- code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb | 5 +---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/code_samples/azure_cosmosdb_nosql/.env-template b/code_samples/azure_cosmosdb_nosql/.env-template index e45f031..184518b 100644 --- a/code_samples/azure_cosmosdb_nosql/.env-template +++ b/code_samples/azure_cosmosdb_nosql/.env-template @@ -4,4 +4,7 @@ cog_search_endpoint= cog_search_key= cosmos_db_connection_string= aoai_embedding_deployed_model = -cosmos_db_connection_string = \ No newline at end of file +cosmos_db_connection_string = +text_table_name = 
+doc_table_name = +database_name = \ No newline at end of file diff --git a/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb index aac9d9d..2268b79 100644 --- a/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb +++ b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb @@ -129,7 +129,7 @@ "text_table_name = 'text_sample'\n", "doc_table_name = 'doc_sample'\n", "\n", - "database_name = \"Vector_DB\"\n", + "database_name = \"Vector_DB\"#####Replace here the name you want to use for your Database####\n", "credential = AzureKeyCredential(str(cog_search_key))\n" ] }, @@ -155,8 +155,6 @@ "source": [ "from azure.cosmos import CosmosClient\n", "\n", - "# Your Cosmos DB connection parameters\n", - "database_name = \"Vector_DB\" #####Replace here the name you want to use for your Database####\n", "\n", "# Initialize the Cosmos DB client\n", "client = CosmosClient(cosmos_db_api_endpoint, credential=cosmos_db_api_key)\n", @@ -190,7 +188,6 @@ "source": [ "\n", "## this adaptation, the script uses the Cosmos DB Python SDK to create items in the Cosmos DB container. 
\n", - "# The to_dict(orient='records') method is used to convert the Pandas DataFrame to a list of dictionaries, where each dictionary represents a record\n", "# Function to insert data into Cosmos DB\n", "def new_container(container):\n", " try:\n", From e0ff7f1c58637d727f6d211797a53184c868a4c6 Mon Sep 17 00:00:00 2001 From: LiliamLeme <62876278+LiliamLeme@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:06:06 +0000 Subject: [PATCH 06/10] Update README.md parameters are inside of the biceps --- code_samples/azure_cosmosdb_nosql/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code_samples/azure_cosmosdb_nosql/README.md b/code_samples/azure_cosmosdb_nosql/README.md index d367c66..347a6ff 100644 --- a/code_samples/azure_cosmosdb_nosql/README.md +++ b/code_samples/azure_cosmosdb_nosql/README.md @@ -22,7 +22,7 @@ Follow the steps to run the code locally. Augment the Azure Cosmos DB data with semantic and vector search capabilities of Azure AI Search.. - For IAC deployment, **[infrastructure](./infrastructure/)** folder has a bicep script to deploy the Azure CosmosDb. In the bicep script, fill out the parameters values in `params` section according to your environment, and run the following command. + For IAC deployment, **[infrastructure](./infrastructure/)** folder has a bicep script to deploy the Azure CosmosDb. In the bicep script, fill out the parameter values according to your environment, and run the following command. Container and database will be created by the ingestion python files, the biceps will only deploy an Empty Cosmos db no SQL. `az deployment group create --resource-group resource_group_name --template-file azure_cosmosdb_nosql.bicep` @@ -54,4 +54,4 @@ Follow the steps to run the code locally. 
- [Hybrid search overview](https://learn.microsoft.com/azure/search/hybrid-search-overview) - [Create a vector index](https://learn.microsoft.com/azure/search/vector-search-how-to-create-index) - [Query a vector index](https://learn.microsoft.com/azure/search/vector-search-how-to-query) - - [Vector search algorithms](https://learn.microsoft.com/azure/search/vector-search-ranking) \ No newline at end of file + - [Vector search algorithms](https://learn.microsoft.com/azure/search/vector-search-ranking) From b97d931fa83a375e229b7b4a5c7b37f91be1cc83 Mon Sep 17 00:00:00 2001 From: LiliamLeme <62876278+LiliamLeme@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:06:28 +0000 Subject: [PATCH 07/10] Update .env-template duplicated variable --- code_samples/azure_cosmosdb_nosql/.env-template | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/code_samples/azure_cosmosdb_nosql/.env-template b/code_samples/azure_cosmosdb_nosql/.env-template index 184518b..53bd5a6 100644 --- a/code_samples/azure_cosmosdb_nosql/.env-template +++ b/code_samples/azure_cosmosdb_nosql/.env-template @@ -4,7 +4,6 @@ cog_search_endpoint= cog_search_key= cosmos_db_connection_string= aoai_embedding_deployed_model = -cosmos_db_connection_string = text_table_name = doc_table_name = -database_name = \ No newline at end of file +database_name = From 1ea324f33823dd2e49a31fce1509cd62808cdc6b Mon Sep 17 00:00:00 2001 From: LiliamLeme Date: Fri, 26 Jan 2024 18:48:24 +0000 Subject: [PATCH 08/10] Found a problem if changes that happened in the libraries and I had to redone a part of the work --- .../azure_cosmosdb_nosql/.env-template | 3 +- code_samples/azure_cosmosdb_nosql/README.md | 2 +- .../azure_cosmosdb_nosql_conda.yml | 4 +- .../cosmos_ingestion.ipynb | 119 +++++++++--------- .../cosmosdb_vector_query.ipynb | 99 ++++++++------- 5 files changed, 118 insertions(+), 109 deletions(-) diff --git a/code_samples/azure_cosmosdb_nosql/.env-template b/code_samples/azure_cosmosdb_nosql/.env-template 
index 184518b..2a443aa 100644 --- a/code_samples/azure_cosmosdb_nosql/.env-template +++ b/code_samples/azure_cosmosdb_nosql/.env-template @@ -4,7 +4,8 @@ cog_search_endpoint= cog_search_key= cosmos_db_connection_string= aoai_embedding_deployed_model = -cosmos_db_connection_string = +aoai_api_version = +AZURE_OPENAI_KEY = text_table_name = doc_table_name = database_name = \ No newline at end of file diff --git a/code_samples/azure_cosmosdb_nosql/README.md b/code_samples/azure_cosmosdb_nosql/README.md index d367c66..43b3573 100644 --- a/code_samples/azure_cosmosdb_nosql/README.md +++ b/code_samples/azure_cosmosdb_nosql/README.md @@ -22,7 +22,7 @@ Follow the steps to run the code locally. Augment the Azure Cosmos DB data with semantic and vector search capabilities of Azure AI Search.. - For IAC deployment, **[infrastructure](./infrastructure/)** folder has a bicep script to deploy the Azure CosmosDb. In the bicep script, fill out the parameters values in `params` section according to your environment, and run the following command. + For IAC deployment, **[infrastructure](./infrastructure/)** folder has a bicep script to deploy the Azure CosmosDb. In the bicep script, fill out the parameter values according to your environment, and run the following command. Container and database will be created by the ingestion python files, the biceps will only deploy an Empty Cosmos db no SQL. 
`az deployment group create --resource-group resource_group_name --template-file azure_cosmosdb_nosql.bicep` diff --git a/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml b/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml index e62a21b..985fb16 100644 --- a/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml +++ b/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml @@ -65,7 +65,7 @@ dependencies: - matplotlib==3.8.2 - networkx==3.2.1 - numpy==1.26.2 - - openai==1.6.1 + - openai==0.28.1 - pandas==2.1.4 - python-dotenv==1.0.0 - regex==2023.12.25 @@ -73,3 +73,5 @@ dependencies: - safetensors==0.4.1 - scikit-learn==1.3.2 - scipy==1.11.4 + - azure-search-documents==11.4.0 + - plotly==5.18.0 diff --git a/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb index 2268b79..a9e7295 100644 --- a/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb +++ b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb @@ -5,14 +5,14 @@ "metadata": {}, "source": [ "\n", - "### Ingestion to COSMOSDB \n" + "### Ingestion to COSMOS DB No SQL\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Libraries" + "#### Pip" ] }, { @@ -22,12 +22,15 @@ "outputs": [], "source": [ "! pip install numpy\n", - "! pip install openai\n", "! pip install python-dotenv\n", "! pip install azure-core \n", "! pip install azure-cosmos\n", "! pip install tenacity\n", - "! pip install azure-search-documents\n" + "! pip install azure-search-documents --pre\n", + "! pip install pandas\n", + "! pip install \"openai==0.28.1\"\n", + "! pip install matplotlib\n", + "! 
pip install plotly\n" ] }, { @@ -39,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -52,9 +55,11 @@ "from azure.cosmos import exceptions, CosmosClient, PartitionKey\n", "from azure.search.documents import SearchClient\n", "from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient\n", - "from azure.search.documents.models import Vector\n", "from azure.search.documents.indexes.models import (\n", " IndexingSchedule,\n", + " HnswParameters,\n", + " SemanticSearch, \n", + " SemanticPrioritizedFields,\n", " SearchIndex,\n", " SearchIndexer,\n", " SearchIndexerDataContainer,\n", @@ -63,9 +68,7 @@ " SearchableField,\n", " SemanticConfiguration,\n", " SimpleField,\n", - " PrioritizedFields,\n", " SemanticField,\n", - " SemanticSettings,\n", " VectorSearch,\n", " VectorSearchAlgorithmConfiguration,\n", " SearchIndexerDataSourceConnection\n", @@ -73,6 +76,7 @@ "\n", "from azure.core.credentials import AzureKeyCredential\n", "from azure.core.exceptions import HttpResponseError\n", + "import pandas as pd\n", "\n", "import openai\n", "from tenacity import retry, wait_random_exponential, stop_after_attempt\n", @@ -100,6 +104,7 @@ "\n", "load_dotenv()\n", "\n", + "##Format: \"AccountEndpoint=https://Nameoftheservice.documents.azure.com;\n", "cosmos_db_api_endpoint = os.getenv(\"cosmos_db_api_endpoint\")\n", "if cosmos_db_api_endpoint is None or cosmos_db_api_endpoint == \"\":\n", " print(\"cosmos_db_api_endpoint environment variable not set.\")\n", @@ -120,6 +125,7 @@ " print(\"cog_search_key environment variable not set.\")\n", " exit()\n", "\n", + "##Format: \"AccountEndpoint=https://Nameoftheservice.documents.azure.com;AccountKey=;Database=Vector_DB ;\"\n", "cosmos_db_connection_string = os.getenv(\"cosmos_db_connection_string\")\n", "if cosmos_db_connection_string is None or cosmos_db_connection_string == \"\":\n", " print(\"cog_search_key environment variable not 
set.\")\n", @@ -182,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -222,7 +228,7 @@ "metadata": {}, "outputs": [], "source": [ - "\n", + "import pandas as pd\n", "\n", "cosmosdb_container_name = text_table_name\n", "container = database.get_container_client(cosmosdb_container_name)\n", @@ -241,8 +247,8 @@ " for item in records:\n", " title = item['title']\n", " content = item['content']\n", - " item['titleVector'] = item['title_vector']\n", - " item['contentVector'] = item['content_vector']\n", + " item['title_vector'] = item['title_vector']\n", + " item['content_vector'] = item['content_vector']\n", " item['@search.action'] = 'upload'\n", "\n", " # Convert the 'id' attribute to a string\n", @@ -313,12 +319,7 @@ "\n", "except Exception as e:\n", " # Handle other exceptions\n", - " print(f\"Error: {e}\")\n", - "\n", - "\n", - "\n", - "\n", - "\n" + " print(f\"Error: {e}\")\n" ] }, { @@ -368,24 +369,21 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "# https://learn.microsoft.com/en-gb/azure/search/search-create-service-portal\n", - "\n", "\n", "# Semantic search configuration\n", "config_text = SemanticConfiguration(\n", " name=\"ConfigSemantictext\",\n", - " prioritized_fields=PrioritizedFields(\n", + " prioritized_fields=SemanticPrioritizedFields (\n", " title_field=SemanticField(field_name=\"title\"),\n", - " prioritized_keywords_fields=[SemanticField(field_name=\"category\")],\n", - " prioritized_content_fields=[SemanticField(field_name=\"content\")]\n", + " keywords_fields=[SemanticField(field_name=\"category\")],\n", + " content_fields=[SemanticField(field_name=\"content\")]\n", " )\n", ")\n", "\n", "# Semantic search configuration\n", "config_doc = SemanticConfiguration(\n", " name=\"ConfigSemanticdoc\",\n", - " prioritized_fields=PrioritizedFields(\n", + " prioritized_fields=SemanticPrioritizedFields (\n", " 
title_field=SemanticField(field_name=\"chunk_content\")\n", " )\n", ")\n", @@ -393,59 +391,68 @@ "\n", "\n", "# Create the configurration\n", - "settings_text = SemanticSettings(configurations=[config_text])\n", - "settings_doc = SemanticSettings(configurations=[config_doc])\n", + "settings_text = SemanticSearch(configurations=[config_text])\n", + "settings_doc = SemanticSearch(configurations=[config_doc])\n", "\n", "\n", "\n", "# Vector search configuration\n", - "vector_search_config = VectorSearchAlgorithmConfiguration(\n", - " name=\"vector-cosmos-config\",\n", - " kind=\"hnsw\",\n", - " hnsw_parameters={\n", - " \"m\": 4,\n", - " \"efConstruction\": 400,\n", - " \"efSearch\": 1000,\n", - " \"metric\": \"cosine\"\n", - " }\n", + "vector_search_config = VectorSearch(\n", + " algorithm_configurations=[\n", + " VectorSearchAlgorithmConfiguration(\n", + " name=\"vector-cosmos-config\",\n", + " kind=\"hnsw\",\n", + " hnsw_parameters=HnswParameters(m=4, ef_construction=400, ef_search=500, metric=\"cosine\")\n", + " )\n", + " ]\n", ")\n", "\n", + "\n", "try:\n", - " index_client = SearchIndexClient(endpoint=cog_search_endpoint, credential=cog_search_cred)\n", + " index_client = SearchIndexClient(endpoint=cog_search_endpoint, credential=credential)\n", "\n", "\n", " # Define index fields for text_sample\n", " text_index_name = \"text_sample_index\"\n", " text_fields = [\n", " SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n", - " SearchableField(name=\"title\", type=SearchFieldDataType.String,\n", - " searchable=True, retrievable=True),\n", - " SearchableField(name=\"content\", type=SearchFieldDataType.String,\n", - " searchable=True, retrievable=True),\n", - " SearchableField(name=\"category\", type=SearchFieldDataType.String,\n", - " filterable=True, searchable=True, retrievable=True),\n", - " SearchField(name=\"title_vector\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", - " searchable=True, dimensions=1536, 
vector_search_configuration=\"vector-cosmos-config\"),\n", - " SearchField(name=\"content_vector\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", - " searchable=True, dimensions=1536, vector_search_configuration=\"vector-cosmos-config\"),\n", + " SearchableField(name=\"title\", type=SearchFieldDataType.String, searchable=True, retrievable=True),\n", + " SearchableField(name=\"content\", type=SearchFieldDataType.String, searchable=True, retrievable=True),\n", + " SearchableField(name=\"category\", type=SearchFieldDataType.String, filterable=True, searchable=True, retrievable=True),\n", + " # Ensure dimensions and vectorSearchConfiguration are set for title_vector\n", + " SearchField(name=\"title_vector\",\n", + " type=SearchFieldDataType.Collection(SearchFieldDataType.String), # Change the type to Collection(Edm.String)\n", + " searchable=True, retrievable=True,\n", + " dimensions=1536, # Adjust dimensions as needed\n", + " vector_search_configuration=\"vector-cosmos-config\"),\n", + "\n", + " SearchField(name=\"content_vector\",\n", + " type=SearchFieldDataType.Collection(SearchFieldDataType.String), # Change the type to Collection(Edm.String)\n", + " searchable=True, retrievable=True,\n", + " dimensions=1536, # Adjust dimensions as needed\n", + " vector_search_configuration=\"vector-cosmos-config\"),\n", + "\n", " ]\n", "\n", "\n", "\n", + "\n", " # Define index fields for doc_sample\n", " doc_index_name = \"doc_sample_index\"\n", " doc_fields = [\n", " SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n", " SearchableField(name=\"chunk_content\", type=SearchFieldDataType.String,\n", " searchable=True, retrievable=True),\n", - " SearchField(name=\"chunk_content_vector\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", - " searchable=True, dimensions=1536, vector_search_configuration=\"vector-cosmos-config\"), \n", + " SearchField(name=\"chunk_content_vector\",\n", + " 
type=SearchFieldDataType.Collection(SearchFieldDataType.String), # Change the type to Collection(Edm.String)\n", + " searchable=True, retrievable=True,\n", + " dimensions=1536, # Adjust dimensions as needed\n", + " vector_search_configuration=\"vector-cosmos-config\"),\n", + " \n", " ]\n", "\n", "\n", - "\n", - "\n", - " \n", + " \n", "\n", " \n", " # Create indexes\n", @@ -485,7 +492,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -516,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -554,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -598,7 +605,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.1" }, "orig_nbformat": 4 }, diff --git a/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb b/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb index b6136f1..1b52d92 100644 --- a/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb +++ b/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -46,23 +46,21 @@ " SearchableField,\n", " SemanticConfiguration,\n", " SimpleField,\n", - " PrioritizedFields,\n", " SemanticField,\n", - " SemanticSettings,\n", " VectorSearch,\n", - " VectorSearchAlgorithmConfiguration,\n", - " SearchIndexerDataSourceConnection,\n", + " SearchIndexerDataSourceConnection,\n", " \n", ")\n", "\n", - "import openai\n", + "\n", "from tenacity import retry, wait_random_exponential, stop_after_attempt\n", "\n", "\n", "import os\n", "from dotenv import load_dotenv\n", "\n", - "from openai.embeddings_utils import get_embedding, cosine_similarity\n" + "import 
openai\n", + "from openai.embeddings_utils import cosine_similarity\n" ] }, { @@ -110,7 +108,14 @@ "if aoai_embedding_deployed_model is None or aoai_embedding_deployed_model == \"\":\n", " print(\"aoai_embedding_deployed_model environment variable not set.\")\n", " exit()\n", - " \n", + "azure_openai_key = os.getenv(\"AZURE_OPENAI_KEY\")\n", + "if azure_openai_key is None or azure_openai_key == \"\":\n", + " print(\"azure_openai_key environment variable not set.\")\n", + " exit()\n", + "aoai_api_version = os.getenv(\"AOAI_API_VERSION\")\n", + "if aoai_api_version is None or aoai_api_version == \"\":\n", + " print(\"aoai_api_version environment variable not set.\")\n", + " exit() \n", "\n", "text_table_name = 'text_sample'\n", "doc_table_name = 'doc_sample'\n", @@ -120,6 +125,27 @@ "credential = AzureKeyCredential(str(cog_search_key))\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Open AI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "from openai.embeddings_utils import cosine_similarity\n", + "\n", + "\n", + "openai.api_key = azure_openai_key\n", + "openai.api_version = aoai_api_version\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -129,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -142,53 +168,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Simple Vector Search\n", - "##### You will use the index name that you created previously on the ingestion steps for the respectively containers\n" + "##### Embedding Function" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 39, "metadata": {}, + "outputs": [], "source": [ - "#### Using Simple Search with Cosine similarity" + "def get_embedding(text, model=aoai_embedding_deployed_model):\n", + " text = text.replace(\"\\n\", \" \")\n", + " return 
openai.embeddings.create(input = [text], model=model).data[0].embedding" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "\n", - "# Query Cosmos DB using Azure Cognitive Search\n", - "\n", - "container_name = 'text_sample'\n", - "index_name = \"text_sample_index\"\n", - "\n", - "container = database.get_container_client(container_name)\n", - "search_client = SearchClient(cog_search_endpoint, index_name, credential) \n", - "\n", - "\n", - "query = 'tools for software development'\n", - "query_vector = get_embedding(query, k=3, engine=aoai_embedding_deployed_model)\n", - "\n", - "# Perform Azure Cognitive Search query\n", - "search_results = search_client.search(search_text=query, select=[\"title\", \"content\", \"category\", \"title_vector\", \"content_vector\"])\n", - "\n", - "\n", - "for result in search_results:\n", - " result_vector = result.get(\"content_vector\", None)\n", - "\n", - " if result_vector is not None and len(result_vector) > 0:\n", - " similarity_score = cosine_similarity(query_vector, result_vector)\n", - "\n", - " print(f\"Title: {result['title']}\")\n", - " print(f\"Score: {result['@search.score']}\")\n", - " print(f\"Content: {result['content']}\")\n", - " print(f\"Category: {result['category']}\")\n", - " print(f\"Cosine Similarity: {similarity_score}\\n\")\n", - " else:\n", - " print(f\"Skipping result with empty or missing vector.\\n\")" + "### Simple Vector Search\n", + "##### You will use the index name that you created previously on the ingestion steps for the respectively containers\n" ] }, { @@ -219,7 +218,7 @@ "\n", "search_results = search_client.search( \n", " search_text=\"\", \n", - " vector=Vector(value=get_embedding(query, k=3, engine=aoai_embedding_deployed_model), fields = \"content_vector\"), \n", + " vector=Vector(value=get_embedding(query, model=aoai_embedding_deployed_model), fields = \"content_vector\"), \n", " select=[\"title\", 
\"content\", \"category\", \"title_vector\", \"content_vector\"]\n", " )\n", "\n", @@ -251,7 +250,7 @@ "search_client = SearchClient(cog_search_endpoint, index_name, credential) \n", "\n", "query = 'tools for software development'\n", - "query_vector = get_embedding(query, k=3, engine=aoai_embedding_deployed_model)\n", + "query_vector = get_embedding(query, model=aoai_embedding_deployed_model)\n", "\n", "# Perform Azure Cognitive Search query\n", "search_results = search_client.search(\n", @@ -365,7 +364,7 @@ "\n", "search_results = search_client.search( \n", " search_text=\"\", \n", - " vector=Vector(value=get_embedding(query, k=3, engine=aoai_embedding_deployed_model), fields = \"chunk_content_vector\"), \n", + " vector=Vector(value=get_embedding(query, engine=aoai_embedding_deployed_model), fields = \"chunk_content_vector\"), \n", " select=[\"chunk_content\", \"chunk_content_vector\"]\n", " )\n", "\n", @@ -452,7 +451,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.1" }, "orig_nbformat": 4 }, From 74223ea89d9d5c8f2bb94eead9902d7b5d5f08c3 Mon Sep 17 00:00:00 2001 From: LiliamLeme Date: Fri, 26 Jan 2024 18:54:46 +0000 Subject: [PATCH 09/10] variables adjusts --- code_samples/azure_cosmosdb_nosql/.env-template | 2 ++ code_samples/azure_cosmosdb_nosql/README.md | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/code_samples/azure_cosmosdb_nosql/.env-template b/code_samples/azure_cosmosdb_nosql/.env-template index 66818ce..0c18755 100644 --- a/code_samples/azure_cosmosdb_nosql/.env-template +++ b/code_samples/azure_cosmosdb_nosql/.env-template @@ -8,3 +8,5 @@ cosmos_db_connection_string = text_table_name = doc_table_name = database_name = +azure_openai_key = +openai.api_version = \ No newline at end of file diff --git a/code_samples/azure_cosmosdb_nosql/README.md b/code_samples/azure_cosmosdb_nosql/README.md index d869574..16f42b3 100644 --- 
a/code_samples/azure_cosmosdb_nosql/README.md +++ b/code_samples/azure_cosmosdb_nosql/README.md @@ -22,7 +22,7 @@ Follow the steps to run the code locally. Augment the Azure Cosmos DB data with semantic and vector search capabilities of Azure AI Search.. - For IAC deployment, **[infrastructure](./infrastructure/)** folder has a bicep script to deploy the Azure CosmosDb. In the bicep script, fill out the parameters values in `params` section according to your environment, and run the following command. + For IAC deployment, **[infrastructure](./infrastructure/)** folder has a bicep script to deploy the Azure CosmosDb. In the bicep script, fill out the parameters values according to your environment, and run the following command. It will deploy an empty Cosmos Db No SQL, containers will be created in the ingestion step. `az deployment group create --resource-group resource_group_name --template-file azure_cosmosdb_nosql.bicep` From c95fa3de39a79742c610e5963db08b2d1b4c1996 Mon Sep 17 00:00:00 2001 From: LiliamLeme Date: Wed, 31 Jan 2024 11:35:29 +0000 Subject: [PATCH 10/10] Merge branch 'leme_nosqlcosmos' of https://github.com/LiliamLeme/azure-vector-database-samples into leme_nosqlcosmos --- .../azure_cosmosdb_nosql/.env-template | 13 +- code_samples/azure_cosmosdb_nosql/README.md | 25 +- .../azure_cosmosdb_nosql_conda.yml | 240 ++++++++++++----- .../cosmos_ingestion.ipynb | 244 +++++++++++------- .../cosmosdb_vector_query.ipynb | 162 ++++++++---- .../azure_cosmosdb_nosql/requirements.txt | Bin 0 -> 1580 bytes 6 files changed, 455 insertions(+), 229 deletions(-) create mode 100644 code_samples/azure_cosmosdb_nosql/requirements.txt diff --git a/code_samples/azure_cosmosdb_nosql/.env-template b/code_samples/azure_cosmosdb_nosql/.env-template index 0c18755..9d67711 100644 --- a/code_samples/azure_cosmosdb_nosql/.env-template +++ b/code_samples/azure_cosmosdb_nosql/.env-template @@ -1,12 +1,9 @@ cosmos_db_api_endpoint= cosmos_db_api_key= +cosmos_db_connection_string= 
cog_search_endpoint= cog_search_key= -cosmos_db_connection_string= -aoai_embedding_deployed_model = -cosmos_db_connection_string = -text_table_name = -doc_table_name = -database_name = -azure_openai_key = -openai.api_version = \ No newline at end of file +AOAI_ENDPOINT= +AOAI_API_VERSION = +AOAI_EMBEDDING_DEPLOYED_MODEL= +AZURE_OPENAI_KEY= diff --git a/code_samples/azure_cosmosdb_nosql/README.md b/code_samples/azure_cosmosdb_nosql/README.md index 16f42b3..0f05770 100644 --- a/code_samples/azure_cosmosdb_nosql/README.md +++ b/code_samples/azure_cosmosdb_nosql/README.md @@ -8,12 +8,25 @@ Follow the steps to run the code locally. 1. The samples uses Conda to manage virtual environments. Create a conda environment using the [azure_cosmosdb_nosql_conda.yml](./azure_cosmosdb_nosql_conda.yml) file to include all necessary python dependencies. - `conda env create -f azure_cosmosdb_nosql_conda.yml` + `conda env create -n cosmostest -f azure_cosmosdb_nosql_conda.yml` + + + + **Alternatively** + + a. You could install the [requirements.txt](./requirements.txt) in your environment **instead** of the yml. + + `pip install -r /path/to/requirements.txt` + + b. Or run the pip install commands from the ingestion sample notebook - [azure_cosmos_ingestion.ipynb](./cosmos_ingestion.ipynb). + + 2. Create a *.env* file from the *.env-template* and populate it with all necessary keys. 3. Finally, follow the instructions mentioned here to run the code locally using VS Code - [Run the Code Locally](../README.md#run-the-code-locally) + ## Resources Deployment - Azure CosmosDb @@ -22,7 +35,8 @@ Follow the steps to run the code locally. Augment the Azure Cosmos DB data with semantic and vector search capabilities of Azure AI Search.. - For IAC deployment, **[infrastructure](./infrastructure/)** folder has a bicep script to deploy the Azure CosmosDb. In the bicep script, fill out the parameters values according to your environment, and run the following command.
It will deploy an empty Cosmos Db No SQL, containers will be created in the ingestion step. + For IAC deployment, **[infrastructure](./infrastructure/)** folder has a bicep script to deploy the Azure CosmosDb. In the bicep script, **fill out the parameter values** according to your environment, and run the following command. + Note: It will deploy an empty Cosmos DB NoSQL resource; containers will be created in the ingestion step. `az deployment group create --resource-group resource_group_name --template-file azure_cosmosdb_nosql.bicep` @@ -32,6 +46,10 @@ Follow the steps to run the code locally. Azure OpenAI Service resource can be deployed using [Azure Portal](https://learn.microsoft.com/azure/ai-services/openai/how-to/create-resource?pivots=web-portal), [Azure CLI](https://learn.microsoft.com/azure/ai-services/openai/how-to/create-resource?pivots=cli) or [Azure PowerShell](https://learn.microsoft.com/azure/ai-services/openai/how-to/create-resource?pivots=ps). Again, [private endpoints](https://learn.microsoft.com/azure/ai-services/cognitive-services-virtual-networks?context=%2Fazure%2Fai-services%2Fopenai%2Fcontext%2Fcontext&tabs=portal#use-private-endpoints) can be used for Azure AI services resources to allow clients on a virtual network to securely access data over Azure Private Link. + Please note: to use semantic search, you need to enable it on the search service: [Semantic](https://learn.microsoft.com/en-us/azure/search/semantic-how-to-enable-disable?tabs=enable-portal) + + **In summary:** You will need an Azure OpenAI Service with an embedding model deployed (for example: text-embedding-ada-002), and a Cognitive Search Service with Semantic Search enabled. + ## Datasets - [text](../data/text/) - for text search sample @@ -55,3 +73,6 @@ Follow the steps to run the code locally.
- [Create a vector index](https://learn.microsoft.com/azure/search/vector-search-how-to-create-index) - [Query a vector index](https://learn.microsoft.com/azure/search/vector-search-how-to-query) - [Vector search algorithms](https://learn.microsoft.com/azure/search/vector-search-ranking) + - [Create a Service](https://learn.microsoft.com/en-us/azure/search/search-create-service-portal) + - [Vector Store](https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-create-index?tabs=config-2023-11-01%2Crest-2023-11-01%2Cpush%2Cportal-check-index) + - [Deploy Models](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#embeddings-models) diff --git a/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml b/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml index 985fb16..b2d38c2 100644 --- a/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml +++ b/code_samples/azure_cosmosdb_nosql/azure_cosmosdb_nosql_conda.yml @@ -3,75 +3,185 @@ channels: - conda-forge - defaults dependencies: - - blas=1.0=mkl - - brotli-python=1.0.9=py312hd77b12b_7 + - aiohttp=3.8.5=py310h8d17308_0 + - aiosignal=1.3.1=pyhd8ed1ab_0 + - anyio=3.7.1=pyhd8ed1ab_0 + - appdirs=1.4.4=pyh9f0ad1d_0 + - asttokens=2.4.0=pyhd8ed1ab_0 + - async-timeout=4.0.3=pyhd8ed1ab_0 + - attrs=23.1.0=pyh71513ae_1 + - backcall=0.2.0=pyhd3eb1b0_0 + - backports=1.1=pyhd3eb1b0_0 + - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0 + - brotli=1.1.0=hcfcfb64_0 + - brotli-bin=1.1.0=hcfcfb64_0 + - brotli-python=1.1.0=py310h00ffb61_0 - bzip2=1.0.8=he774522_0 - ca-certificates=2023.12.12=haa95532_0 - - certifi=2023.11.17=py312haa95532_0 - - cffi=1.16.0=py312h2bbff1b_0 - - charset-normalizer=2.0.4=pyhd3eb1b0_0 - - click=8.1.7=py312haa95532_0 - - colorama=0.4.6=py312haa95532_0 - - cryptography=41.0.7=py312h89fc84f_0 - - expat=2.5.0=hd77b12b_0 - - idna=3.4=py312haa95532_0 - - intel-openmp=2023.1.0=h59b6b97_46320 + - cachetools=5.3.1=pyhd8ed1ab_0 + - 
certifi=2023.11.17=py310haa95532_0 + - cffi=1.15.1=py310h8d17308_5 + - charset-normalizer=3.2.0=pyhd8ed1ab_0 + - click=8.1.7=win_pyh7428d3b_0 + - colorama=0.4.6=py310haa95532_0 + - comm=0.1.4=pyhd8ed1ab_0 + - contourpy=1.1.1=py310h232114e_1 + - cryptography=41.0.4=py310h6e82f81_0 + - cycler=0.11.0=pyhd8ed1ab_0 + - dataclasses-json=0.5.7=pyhd8ed1ab_0 + - debugpy=1.8.0=py310h00ffb61_1 + - decorator=5.1.1=pyhd3eb1b0_0 + - docker-pycreds=0.4.0=py_0 + - et_xmlfile=1.1.0=pyhd8ed1ab_0 + - exceptiongroup=1.2.0=py310haa95532_0 + - executing=1.2.0=pyhd8ed1ab_0 + - fonttools=4.42.1=py310h8d17308_0 + - freetype=2.12.1=hdaf720e_2 + - frozenlist=1.4.0=py310h8d17308_1 + - gitdb=4.0.10=pyhd8ed1ab_0 + - gitpython=3.1.37=pyhd8ed1ab_0 + - greenlet=2.0.2=py310h00ffb61_1 + - idna=3.4=pyhd8ed1ab_0 + - importlib-metadata=7.0.1=py310haa95532_0 + - importlib_metadata=7.0.1=hd3eb1b0_0 + - intel-openmp=2023.2.0=h57928b3_49503 + - ipykernel=6.28.0=py310haa95532_0 + - ipython=8.20.0=py310haa95532_0 + - jedi=0.19.0=pyhd8ed1ab_0 + - joblib=1.3.2=pyhd8ed1ab_0 + - jsonpatch=1.33=pyhd8ed1ab_0 + - jsonpointer=2.4=py310h5588dad_3 + - jupyter_client=8.6.0=py310haa95532_0 + - jupyter_core=5.5.0=py310haa95532_0 + - kiwisolver=1.4.5=py310h232114e_1 + - langchain=0.0.304=pyhd8ed1ab_0 + - langsmith=0.0.41=pyhd8ed1ab_0 + - lcms2=2.15=he9d350c_2 + - lerc=4.0.0=h63175ca_0 + - libabseil=20230802.1=cxx17_h63175ca_0 + - libblas=3.9.0=18_win64_mkl + - libbrotlicommon=1.1.0=hcfcfb64_0 + - libbrotlidec=1.1.0=hcfcfb64_0 + - libbrotlienc=1.1.0=hcfcfb64_0 + - libcblas=3.9.0=18_win64_mkl + - libdeflate=1.19=hcfcfb64_0 - libffi=3.4.4=hd77b12b_0 - - mkl=2023.1.0=h6b88ed4_46358 - - mkl-service=2.4.0=py312h2bbff1b_1 - - mkl_fft=1.3.8=py312h2bbff1b_0 - - mkl_random=1.2.4=py312h59b6b97_0 - - numpy=1.26.3=py312hfd52020_0 - - numpy-base=1.26.3=py312h4dde369_0 - - openssl=3.0.12=h2bbff1b_0 - - pip=23.3.1=py312haa95532_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=23.2.0=py312haa95532_0 - - pysocks=1.7.1=py312haa95532_0 - - 
python=3.12.0=h1d929f7_0 - - python-dotenv=0.21.0=py312haa95532_0 - - requests=2.31.0=py312haa95532_0 - - setuptools=68.2.2=py312haa95532_0 + - libhwloc=2.9.3=default_haede6df_1009 + - libiconv=1.17=h8ffe710_0 + - libjpeg-turbo=2.1.5.1=hcfcfb64_1 + - liblapack=3.9.0=18_win64_mkl + - libpng=1.6.39=h19919ed_0 + - libprotobuf=4.24.3=hb8276f3_0 + - libsodium=1.0.18=h62dcd97_0 + - libsqlite=3.44.2=hcfcfb64_0 + - libtiff=4.6.0=h4554b19_1 + - libwebp-base=1.3.2=hcfcfb64_0 + - libxcb=1.15=hcd874cb_0 + - libxml2=2.11.5=hc3477c8_1 + - libzlib=1.2.13=hcfcfb64_5 + - m2w64-gcc-libgfortran=5.3.0=6 + - m2w64-gcc-libs=5.3.0=7 + - m2w64-gcc-libs-core=5.3.0=7 + - m2w64-gmp=6.1.0=2 + - m2w64-libwinpthread-git=5.0.0.4634.697f757=2 + - matplotlib-base=3.8.0=py310hc9baf74_1 + - matplotlib-inline=0.1.6=py310haa95532_0 + - mkl=2022.1.0=h6a75c08_874 + - msys2-conda-epoch=20160418=1 + - multidict=6.0.4=py310h8d17308_0 + - munkres=1.1.4=pyh9f0ad1d_0 + - mypy_extensions=1.0.0=pyha770c72_0 + - nest-asyncio=1.5.6=py310haa95532_0 + - numexpr=2.8.7=mkl_py310hd551296_0 + - numpy=1.26.0=py310hf667824_0 + - openai=0.28.1=pyhd8ed1ab_0 + - openapi-schema-pydantic=1.2.4=pyhd8ed1ab_0 + - openjpeg=2.5.0=h3d672ee_3 + - openpyxl=3.1.2=py310h8d17308_1 + - openssl=3.1.3=hcfcfb64_0 + - packaging=23.1=py310haa95532_0 + - pandas=2.1.1=py310hecd3228_0 + - pandas-stubs=2.0.3.230814=pyhd8ed1ab_0 + - parso=0.8.3=pyhd3eb1b0_0 + - pathtools=0.1.2=py_1 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pillow=10.0.1=py310h6abe1ea_1 + - pip=23.3.1=py310haa95532_0 + - platformdirs=3.10.0=py310haa95532_0 + - plotly=5.17.0=pyhd8ed1ab_0 + - prompt-toolkit=3.0.43=py310haa95532_0 + - prompt_toolkit=3.0.43=hd3eb1b0_0 + - protobuf=4.24.3=py310h19be30a_0 + - psutil=5.9.5=py310h8d17308_1 + - pthread-stubs=0.4=hcd874cb_1001 + - pthreads-win32=2.9.1=hfa6e2cd_3 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - pyasn1=0.5.0=pyhd8ed1ab_0 + - pyasn1-modules=0.3.0=pyhd8ed1ab_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pydantic=1.10.13=py310h8d17308_0 + - 
pygments=2.16.1=pyhd8ed1ab_0 + - pyopenssl=23.2.0=pyhd8ed1ab_1 + - pyparsing=3.1.1=pyhd8ed1ab_0 + - pypdf2=2.11.1=pyhd8ed1ab_0 + - pysocks=1.7.1=pyh0701188_6 + - python=3.10.13=he1021f5_0 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python-dotenv=1.0.0=pyhd8ed1ab_1 + - python-tzdata=2023.3=pyhd8ed1ab_0 + - python_abi=3.10=2_cp310 + - pytz=2023.3.post1=pyhd8ed1ab_0 + - pyu2f=0.1.5=pyhd8ed1ab_0 + - pywin32=306=py310h00ffb61_1 + - pyyaml=6.0.1=py310h8d17308_1 + - pyzmq=25.1.2=py310hd77b12b_0 + - requests=2.31.0=pyhd8ed1ab_0 + - rsa=4.9=pyhd8ed1ab_0 + - scikit-learn=1.3.1=py310hfd2573f_0 + - scipy=1.11.2=py310h70e3499_1 + - sentry-sdk=1.31.0=pyhd8ed1ab_0 + - setproctitle=1.3.2=py310h8d17308_2 + - setuptools=68.2.2=py310haa95532_0 + - six=1.16.0=pyhd3eb1b0_1 + - smmap=3.0.5=pyh44b312d_0 + - sniffio=1.3.0=pyhd8ed1ab_0 + - sqlalchemy=2.0.21=py310h8d17308_0 - sqlite=3.41.2=h2bbff1b_0 - - tbb=2021.8.0=h59b6b97_0 - - tk=8.6.12=h2bbff1b_0 + - stack_data=0.6.2=pyhd8ed1ab_0 + - stringcase=1.2.0=py_0 + - tbb=2021.10.0=h91493d7_1 + - tenacity=8.2.3=pyhd8ed1ab_0 + - threadpoolctl=3.2.0=pyha21a80b_0 + - tk=8.6.13=hcfcfb64_0 + - tornado=6.3.3=py310h2bbff1b_0 + - tqdm=4.66.1=pyhd8ed1ab_0 + - traitlets=5.10.1=pyhd8ed1ab_0 + - types-pytz=2023.3.1.1=pyhd8ed1ab_0 + - typing-extensions=4.9.0=py310haa95532_1 + - typing_extensions=4.9.0=py310haa95532_1 + - typing_inspect=0.9.0=pyhd8ed1ab_0 - tzdata=2023d=h04d1e81_0 - - urllib3=1.26.18=py312haa95532_0 - - vc=14.2=h21ff451_1 - - vs2015_runtime=14.27.29016=h5e58377_2 - - wheel=0.41.2=py312haa95532_0 - - win_inet_pton=1.1.0=py312haa95532_0 + - ucrt=10.0.22621.0=h57928b3_0 + - unicodedata2=15.1.0=py310h8d17308_0 + - urllib3=2.0.5=pyhd8ed1ab_0 + - vc=14.3=h64f974e_17 + - vc14_runtime=14.38.33130=h82b7239_18 + - vs2015_runtime=14.38.33130=hcb4865c_18 + - wandb=0.15.11=pyhd8ed1ab_0 + - wcwidth=0.2.6=pyhd8ed1ab_0 + - wheel=0.41.2=py310haa95532_0 + - win_inet_pton=1.1.0=pyhd8ed1ab_6 + - xorg-libxau=1.0.11=hcd874cb_0 + - xorg-libxdmcp=1.1.3=hcd874cb_0 - 
xz=5.4.5=h8cc25b3_0 - - zlib=1.2.13=h8cc25b3_0 + - yaml=0.2.5=h8ffe710_2 + - yarl=1.9.2=py310h8d17308_0 + - zeromq=4.3.5=hd77b12b_0 + - zipp=3.17.0=py310haa95532_0 + - zlib=1.2.13=hcfcfb64_5 + - zstd=1.5.5=h12be248_0 - pip: - - annotated-types==0.6.0 - - anyio==4.2.0 - - certifi==2023.11.17 - - charset-normalizer==3.3.2 - - click==8.1.7 - - contourpy==1.2.0 - - cycler==0.12.1 - - distro==1.9.0 - - filelock==3.13.1 - - fonttools==4.47.0 - - fsspec==2023.12.2 - - h11==0.14.0 - - httpcore==1.0.2 - - httpx==0.26.0 - - idna==3.6 - - jinja2==3.1.2 - - joblib==1.3.2 - - markupsafe==2.1.3 - - matplotlib==3.8.2 - - networkx==3.2.1 - - numpy==1.26.2 - - openai==0.28.1 - - pandas==2.1.4 - - python-dotenv==1.0.0 - - regex==2023.12.25 - - requests==2.31.0 - - safetensors==0.4.1 - - scikit-learn==1.3.2 - - scipy==1.11.4 - - azure-search-documents==11.4.0 - - plotly==5.18.0 + - azure-common==1.1.28 + - azure-core==1.29.4 + - azure-search-documents==11.4.0b9 + - isodate==0.6.1 diff --git a/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb index a9e7295..42b86fa 100644 --- a/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb +++ b/code_samples/azure_cosmosdb_nosql/cosmos_ingestion.ipynb @@ -5,14 +5,15 @@ "metadata": {}, "source": [ "\n", - "### Ingestion to COSMOS DB No SQL\n" + "## Ingestion to Azure Cosmos DB NoSQL\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Pip" + "#### Pip \n", + "##### (Optional: As you can use requirements.txt or Yml if you prefer, instead the pip install)" ] }, { @@ -21,16 +22,21 @@ "metadata": {}, "outputs": [], "source": [ + "##You can also use the requirements.txt or yml file.\n", "! pip install numpy\n", "! pip install python-dotenv\n", "! pip install azure-core \n", "! pip install azure-cosmos\n", "! pip install tenacity\n", - "! pip install azure-search-documents --pre\n", + "! pip install azure-search-documents===11.4.0\n", "! pip install pandas\n", - "! 
pip install \"openai==0.28.1\"\n", + "! pip install openai==0.28.1\n", "! pip install matplotlib\n", - "! pip install plotly\n" + "! pip install plotly\n", + "! pip install plotly\n", + "! pip install scikit-learn\n", + "! pip install scipy\n", + "! pip install Pyarrow " ] }, { @@ -42,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +63,6 @@ "from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient\n", "from azure.search.documents.indexes.models import (\n", " IndexingSchedule,\n", - " HnswParameters,\n", " SemanticSearch, \n", " SemanticPrioritizedFields,\n", " SearchIndex,\n", @@ -71,7 +76,14 @@ " SemanticField,\n", " VectorSearch,\n", " VectorSearchAlgorithmConfiguration,\n", - " SearchIndexerDataSourceConnection\n", + " SearchIndexerDataSourceConnection,\n", + " VectorSearchAlgorithmKind,\n", + " VectorSearchAlgorithmMetric,\n", + " HnswAlgorithmConfiguration,\n", + " HnswParameters,\n", + " VectorSearchProfile,\n", + " ExhaustiveKnnAlgorithmConfiguration,\n", + " ExhaustiveKnnParameters\n", ")\n", "\n", "from azure.core.credentials import AzureKeyCredential\n", @@ -104,39 +116,70 @@ "\n", "load_dotenv()\n", "\n", - "##Format: \"AccountEndpoint=https://Nameoftheservice.documents.azure.com;\n", + "## Cosmos db endpoint format: https://.documents.azure.com\n", "cosmos_db_api_endpoint = os.getenv(\"cosmos_db_api_endpoint\")\n", "if cosmos_db_api_endpoint is None or cosmos_db_api_endpoint == \"\":\n", " print(\"cosmos_db_api_endpoint environment variable not set.\")\n", " exit()\n", "\n", + "## Cosmos db API Key\n", "cosmos_db_api_key = os.getenv(\"cosmos_db_api_key\")\n", "if cosmos_db_api_key is None or cosmos_db_api_key == \"\":\n", " print(\"cosmos_db_api_key environment variable not set.\")\n", " exit()\n", "\n", + "##Cosmos Connection String. 
Format: \n", + "##AccountEndpoint=https://.documents.azure.com;AccountKey=;Database=;\n", + "cosmos_db_connection_string = os.getenv(\"cosmos_db_connection_string\")\n", + "if cosmos_db_connection_string is None or cosmos_db_connection_string == \"\":\n", + " print(\"cosmos_db_connection_string environment variable not set.\")\n", + " exit()\n", + " \n", + "##Cognitive Search Service Name, you need to deploy this service. Format: https://.search.windows.net\n", "cog_search_endpoint = os.getenv(\"cog_search_endpoint\")\n", "if cog_search_endpoint is None or cog_search_endpoint == \"\":\n", " print(\"cog_search_endpoint environment variable not set.\")\n", " exit()\n", "\n", + "##Cognitive Search Service Key\n", "cog_search_key = os.getenv(\"cog_search_key\")\n", "if cog_search_key is None or cog_search_key == \"\":\n", " print(\"cog_search_key environment variable not set.\")\n", " exit()\n", "\n", - "##Format: \"AccountEndpoint=https://Nameoftheservice.documents.azure.com;AccountKey=;Database=Vector_DB ;\"\n", - "cosmos_db_connection_string = os.getenv(\"cosmos_db_connection_string\")\n", - "if cosmos_db_connection_string is None or cosmos_db_connection_string == \"\":\n", - " print(\"cog_search_key environment variable not set.\")\n", - " exit()\n", " \n", + "##Open AI Service. This must be deployed. Format:https://nameoftheservice.azure.com/ \n", + "aoai_endpoint = os.getenv(\"AOAI_ENDPOINT\") ##api_base \n", + "if aoai_endpoint is None or aoai_endpoint == \"\":\n", + " print(\"AOAI_ENDPOINT environment variable not set.\")\n", + " exit()\n", + "\n", + "##Version of the Open AI Service. This was build with the \"2023-05-15\" version\n", + "aoai_api_version = os.getenv(\"AOAI_API_VERSION\")\n", + "if aoai_api_version is None or aoai_api_version == \"\":\n", + " print(\"AOAI_API_VERSION environment variable not set.\")\n", + " exit()\n", + "\n", + "##Model of the Open AI Service. 
This must be deployed: \"text-embedding-ada-002\"\n", + "aoai_embedding_deployed_model = os.getenv(\"AOAI_EMBEDDING_DEPLOYED_MODEL\")\n", + "if aoai_embedding_deployed_model is None or aoai_embedding_deployed_model == \"\":\n", + " print(\"AOAI_EMBEDDING_DEPLOYED_MODEL environment variable not set.\")\n", + " exit()\n", + "\n", + "##Open AI Service Key.\n", + "azure_openai_key = os.getenv(\"AZURE_OPENAI_KEY\")\n", + "if azure_openai_key is None or azure_openai_key == \"\":\n", + " print(\"AZURE_OPENAI_KEY environment variable not set.\")\n", + " exit()\n", "\n", + "##Container names for the CosmosDB\n", "text_table_name = 'text_sample'\n", "doc_table_name = 'doc_sample'\n", + "image_table_name = 'image_sample'\n", "\n", - "database_name = \"Vector_DB\"#####Replace here the name you want to use for your Database####\n", - "credential = AzureKeyCredential(str(cog_search_key))\n" + "database_name = \"Vector_DB\"\n", + "credential = AzureKeyCredential(str(cog_search_key))\n", + "openai.api_type = \"azure\"\n" ] }, { @@ -183,12 +226,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Function for new Container" + "##### Function for the New Container" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -264,7 +307,6 @@ " print(f\"Document {container} with ID {item['id']} already exists...\")\n", " print(f\"Error: {e}\")\n", "\n", - " # Implement your logic to update the existing document or take appropriate action\n", "\n", "except Exception as e:\n", " # Handle other exceptions\n", @@ -315,8 +357,6 @@ " print(f\"Document {container} with ID {item['id']} already exists...\")\n", " print(f\"Error: {e}\")\n", "\n", - " # Implement your logic to update the existing document or take appropriate action\n", - "\n", "except Exception as e:\n", " # Handle other exceptions\n", " print(f\"Error: {e}\")\n" @@ -370,42 +410,39 @@ "outputs": [], "source": [ "\n", - "# Semantic search
configuration\n", - "config_text = SemanticConfiguration(\n", - " name=\"ConfigSemantictext\",\n", - " prioritized_fields=SemanticPrioritizedFields (\n", - " title_field=SemanticField(field_name=\"title\"),\n", - " keywords_fields=[SemanticField(field_name=\"category\")],\n", - " content_fields=[SemanticField(field_name=\"content\")]\n", + "# Vector search configuration\n", + "##adding profiles as there is a change in this library. Note using Azure Search documents 11.4.0\n", + "vector_search = VectorSearch(\n", + " algorithms=[\n", + " HnswAlgorithmConfiguration(\n", + " name=\"hsnw_config\",\n", + " kind=VectorSearchAlgorithmKind.HNSW,\n", + " parameters=HnswParameters(\n", + " m=4,\n", + " ef_construction=400,\n", + " ef_search=500,\n", + " metric=VectorSearchAlgorithmMetric.COSINE\n", " )\n", - ")\n", - "\n", - "# Semantic search configuration\n", - "config_doc = SemanticConfiguration(\n", - " name=\"ConfigSemanticdoc\",\n", - " prioritized_fields=SemanticPrioritizedFields (\n", - " title_field=SemanticField(field_name=\"chunk_content\")\n", + " ),\n", + " ExhaustiveKnnAlgorithmConfiguration(\n", + " name=\"ExhaustiveKnn\",\n", + " kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,\n", + " parameters=ExhaustiveKnnParameters(\n", + " metric=VectorSearchAlgorithmMetric.COSINE\n", + " )\n", + " )\n", + " ],\n", + " profiles=[\n", + " VectorSearchProfile(\n", + " name=\"HnswProfile\",\n", + " algorithm_configuration_name=\"hsnw_config\",\n", + " ),\n", + " VectorSearchProfile(\n", + " name=\"myExhaustiveKnnProfile\",\n", + " algorithm_configuration_name=\"ExhaustiveKnn\",\n", + " )\n", + " ]\n", " )\n", - ")\n", - "\n", - "\n", - "\n", - "# Create the configurration\n", - "settings_text = SemanticSearch(configurations=[config_text])\n", - "settings_doc = SemanticSearch(configurations=[config_doc])\n", - "\n", - "\n", - "\n", - "# Vector search configuration\n", - "vector_search_config = VectorSearch(\n", - " algorithm_configurations=[\n", - " 
VectorSearchAlgorithmConfiguration(\n", - " name=\"vector-cosmos-config\",\n", - " kind=\"hnsw\",\n", - " hnsw_parameters=HnswParameters(m=4, ef_construction=400, ef_search=500, metric=\"cosine\")\n", - " )\n", - " ]\n", - ")\n", "\n", "\n", "try:\n", @@ -415,69 +452,81 @@ " # Define index fields for text_sample\n", " text_index_name = \"text_sample_index\"\n", " text_fields = [\n", - " SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n", - " SearchableField(name=\"title\", type=SearchFieldDataType.String, searchable=True, retrievable=True),\n", - " SearchableField(name=\"content\", type=SearchFieldDataType.String, searchable=True, retrievable=True),\n", - " SearchableField(name=\"category\", type=SearchFieldDataType.String, filterable=True, searchable=True, retrievable=True),\n", - " # Ensure dimensions and vectorSearchConfiguration are set for title_vector\n", - " SearchField(name=\"title_vector\",\n", - " type=SearchFieldDataType.Collection(SearchFieldDataType.String), # Change the type to Collection(Edm.String)\n", - " searchable=True, retrievable=True,\n", - " dimensions=1536, # Adjust dimensions as needed\n", - " vector_search_configuration=\"vector-cosmos-config\"),\n", - "\n", - " SearchField(name=\"content_vector\",\n", - " type=SearchFieldDataType.Collection(SearchFieldDataType.String), # Change the type to Collection(Edm.String)\n", - " searchable=True, retrievable=True,\n", - " dimensions=1536, # Adjust dimensions as needed\n", - " vector_search_configuration=\"vector-cosmos-config\"),\n", - "\n", - " ]\n", - "\n", - "\n", - "\n", - "\n", + " SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n", + " SearchableField(name=\"title\", type=SearchFieldDataType.String, searchable=True, retrievable=True),\n", + " SearchableField(name=\"content\", type=SearchFieldDataType.String, searchable=True, retrievable=True),\n", + " SearchableField(name=\"category\", type=SearchFieldDataType.String, filterable=True, searchable=True, 
retrievable=True),\n", + " # Ensure dimensions and the vector search profile are set for title_vector\n", + " SearchField(name=\"title_vector\",\n", + " type=SearchFieldDataType.Collection(SearchFieldDataType.Single), # Vector fields use Collection(Edm.Single)\n", + " searchable=True, \n", + " vector_search_dimensions=1536, # Adjust dimensions as needed\n", + " vector_search_profile_name=\"HnswProfile\"),\n", + "\n", + " SearchField(name=\"content_vector\",\n", + " type=SearchFieldDataType.Collection(SearchFieldDataType.Single), # Vector fields use Collection(Edm.Single)\n", + " searchable=True, \n", + " vector_search_dimensions=1536, # Adjust dimensions as needed\n", + " vector_search_profile_name=\"HnswProfile\"),\n", + "\n", + " ]\n", + " \n", " # Define index fields for doc_sample\n", " doc_index_name = \"doc_sample_index\"\n", " doc_fields = [\n", " SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n", - " SearchableField(name=\"chunk_content\", type=SearchFieldDataType.String,\n", + " SearchableField(name=\"chunk_content\", type=SearchFieldDataType.Single,\n", " searchable=True, retrievable=True),\n", " SearchField(name=\"chunk_content_vector\",\n", - " type=SearchFieldDataType.Collection(SearchFieldDataType.String), # Change the type to Collection(Edm.String)\n", - " searchable=True, retrievable=True,\n", - " dimensions=1536, # Adjust dimensions as needed\n", - " vector_search_configuration=\"vector-cosmos-config\"),\n", + " type=SearchFieldDataType.Collection(SearchFieldDataType.Single), # Vector fields use Collection(Edm.Single)\n", + " searchable=True,\n", + " vector_search_dimensions=1536, # Adjust dimensions as needed\n", + " vector_search_profile_name=\"HnswProfile\"),\n", " \n", " ]\n", "\n", "\n", - " \n", + " # Semantic search configuration\n", + " config_text = SemanticConfiguration(\n", + " name=\"ConfigSemantictext\",\n", + " prioritized_fields=SemanticPrioritizedFields (\n", + " 
title_field=SemanticField(field_name=\"title\"),\n", + " keywords_fields=[SemanticField(field_name=\"category\")],\n", + " content_fields=[SemanticField(field_name=\"content\")]\n", + " )\n", + " )\n", + "\n", + " config_doc = SemanticConfiguration(\n", + " name=\"ConfigSemanticdoc\",\n", + " prioritized_fields=SemanticPrioritizedFields (\n", + " title_field=SemanticField(field_name=\"chunk_content\")\n", + " )\n", + " )\n", + "\n", + "\n", + "\n", + " # Create the configurration\n", + " settings_text = SemanticSearch(configurations=[config_text])\n", + " settings_doc = SemanticSearch(configurations=[config_doc])\n", "\n", - " \n", + " \n", " # Create indexes\n", - " text_index = SearchIndex(name=text_index_name, fields=text_fields, vector_search=VectorSearch(algorithm_configurations=[vector_search_config]), semantic_settings=settings_text)\n", - " doc_index = SearchIndex(name=doc_index_name, fields=doc_fields, vector_search=VectorSearch(algorithm_configurations=[vector_search_config]), semantic_settings=settings_doc)\n", + " text_index = SearchIndex(name=text_index_name, fields=text_fields, vector_search=vector_search,semantic_search= settings_text)\n", + " doc_index = SearchIndex(name=doc_index_name, fields=doc_fields, vector_search=vector_search,semantic_search= settings_doc)\n", "\n", "\n", "\n", " # Create or update indexes\n", " index_client.create_or_update_index(text_index)\n", " print(f'Indexes created or updated: {text_index_name}')\n", - " \n", " index_client.create_or_update_index(doc_index)\n", " print(f'Indexes created or updated: {doc_index_name}')\n", - "\n", - " \n", - "\n", - " \n", " \n", "except HttpResponseError as e:\n", " print(f\"HTTP Error: {e}\")\n", " print(f\"Status Code: {e.status_code}\")\n", " print(f\"Error Message: {e.error.message}\")\n", "\n", - "\n", "\n" ] }, @@ -492,11 +541,10 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "\n", "\n", "def 
create_datasource(data_source_name, cosmos_db_connection_string, table_name, indexer_client):\n", " try:\n", @@ -523,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -555,13 +603,13 @@ "metadata": {}, "source": [ "### Creating the datasource and index from the functions already defined\n", - "##### Indexers to crawl data from various data sources and insert them into indexes\n", - "##### Data Sources that connect Azure Cognitive Search to Cosmos" + "##### Indexers to crawl data from the data source and insert them into the indexes\n", + "##### Data Source that connect Azure Cognitive Search to Cosmos NoSQL" ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ diff --git a/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb b/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb index 1b52d92..051a03e 100644 --- a/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb +++ b/code_samples/azure_cosmosdb_nosql/cosmosdb_vector_query.ipynb @@ -4,19 +4,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Vector Search " + "## Vector Search - Azure Cosmos DB No SQL" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Libraries\n" + "#### Libraries\n" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -32,9 +32,9 @@ "from azure.search.documents.models import (\n", " QueryAnswerType,\n", " QueryCaptionType,\n", - " QueryLanguage,\n", - " QueryType,\n", - " Vector \n", + " ##QueryLanguage,\n", + " QueryType\n", + " #Vector \n", ")\n", "from azure.search.documents.indexes.models import (\n", " IndexingSchedule,\n", @@ -48,8 +48,7 @@ " SimpleField,\n", " SemanticField,\n", " VectorSearch,\n", - " SearchIndexerDataSourceConnection,\n", - " \n", + " SearchIndexerDataSourceConnection \n", ")\n", "\n", "\n", 
@@ -67,7 +66,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Enviromnent variables" + "#### Environment variables" ] }, { @@ -79,57 +78,77 @@ "\n", "load_dotenv()\n", "\n", + "## Cosmos db endpoint format: https://.documents.azure.com\n", "cosmos_db_api_endpoint = os.getenv(\"cosmos_db_api_endpoint\")\n", "if cosmos_db_api_endpoint is None or cosmos_db_api_endpoint == \"\":\n", " print(\"cosmos_db_api_endpoint environment variable not set.\")\n", " exit()\n", "\n", + "## Cosmos db API Key\n", "cosmos_db_api_key = os.getenv(\"cosmos_db_api_key\")\n", "if cosmos_db_api_key is None or cosmos_db_api_key == \"\":\n", " print(\"cosmos_db_api_key environment variable not set.\")\n", " exit()\n", "\n", + "##Cosmos Connection String. Format: \n", + "##AccountEndpoint=https://.documents.azure.com;AccountKey=;Database=;\n", + "cosmos_db_connection_string = os.getenv(\"cosmos_db_connection_string\")\n", + "if cosmos_db_connection_string is None or cosmos_db_connection_string == \"\":\n", + " print(\"cosmos_db_connection_string environment variable not set.\")\n", + " exit()\n", + " \n", + "##Cognitive Search Service Name, you need to deploy this service. 
Format: https://.search.windows.net\n", "cog_search_endpoint = os.getenv(\"cog_search_endpoint\")\n", "if cog_search_endpoint is None or cog_search_endpoint == \"\":\n", " print(\"cog_search_endpoint environment variable not set.\")\n", " exit()\n", "\n", + "##Cognitive Search Service Key\n", "cog_search_key = os.getenv(\"cog_search_key\")\n", "if cog_search_key is None or cog_search_key == \"\":\n", " print(\"cog_search_key environment variable not set.\")\n", " exit()\n", "\n", - "cosmos_db_connection_string = os.getenv(\"cosmos_db_connection_string\")\n", - "if cosmos_db_connection_string is None or cosmos_db_connection_string == \"\":\n", - " print(\"cosmos_db_connection_string environment variable not set.\")\n", - " exit()\n", " \n", - "aoai_embedding_deployed_model = os.getenv(\"aoai_embedding_deployed_model\")\n", + "##Open AI Service. This must be deployed. Format:https://nameoftheservice.azure.com/ \n", + "aoai_endpoint = os.getenv(\"AOAI_ENDPOINT\") ##api_base \n", + "if aoai_endpoint is None or aoai_endpoint == \"\":\n", + " print(\"AOAI_ENDPOINT environment variable not set.\")\n", + " exit()\n", + "\n", + "##Version of the Open AI Service. This was built with the \"2023-05-15\" version\n", + "aoai_api_version = os.getenv(\"AOAI_API_VERSION\")\n", + "if aoai_api_version is None or aoai_api_version == \"\":\n", + " print(\"AOAI_API_VERSION environment variable not set.\")\n", + " exit()\n", + "\n", + "##Model of the Open AI Service. 
This must be deployed: \"text-embedding-ada-002\"\n", + "aoai_embedding_deployed_model = os.getenv(\"AOAI_EMBEDDING_DEPLOYED_MODEL\")\n", "if aoai_embedding_deployed_model is None or aoai_embedding_deployed_model == \"\":\n", - " print(\"aoai_embedding_deployed_model environment variable not set.\")\n", + " print(\"AOAI_EMBEDDING_DEPLOYED_MODEL environment variable not set.\")\n", " exit()\n", + "\n", + "##Open AI Service Key.\n", "azure_openai_key = os.getenv(\"AZURE_OPENAI_KEY\")\n", "if azure_openai_key is None or azure_openai_key == \"\":\n", - " print(\"azure_openai_key environment variable not set.\")\n", + " print(\"AZURE_OPENAI_KEY environment variable not set.\")\n", " exit()\n", - "aoai_api_version = os.getenv(\"AOAI_API_VERSION\")\n", - "if aoai_api_version is None or aoai_api_version == \"\":\n", - " print(\"aoai_api_version environment variable not set.\")\n", - " exit() \n", "\n", + "##Container names for the CosmosDB\n", "text_table_name = 'text_sample'\n", "doc_table_name = 'doc_sample'\n", "image_table_name = 'image_sample'\n", "\n", "database_name = \"Vector_DB\"\n", - "credential = AzureKeyCredential(str(cog_search_key))\n" + "credential = AzureKeyCredential(str(cog_search_key))\n", + "openai.api_type = \"azure\"\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Open AI" + "#### Initializing Open AI" ] }, { @@ -143,19 +162,22 @@ "\n", "\n", "openai.api_key = azure_openai_key\n", - "openai.api_version = aoai_api_version\n" + "openai.api_version = aoai_api_version ##2023-05-15##\n", + "openai.api_base = aoai_endpoint\n", + "openai.api_type = \"azure\"\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Initialize the search" + "#### Initialize CosmosDB" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -168,25 +190,51 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Embedding Function" + "#### Embedding and 
Cosine Functions" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "def get_embedding(text, model=aoai_embedding_deployed_model):\n", - " text = text.replace(\"\\n\", \" \")\n", - " return openai.embeddings.create(input = [text], model=model).data[0].embedding" + "import numpy as np\n", + "from typing import List\n", + "\n", + "\n", + "def get_embedding(text: str, engine, **kwargs) -> List[float]:\n", + " # replace newlines, which can negatively affect performance.\n", + " text = text.replace(\"\\n\", \" \")\n", + "\n", + " return openai.Embedding.create(input=[text], engine=engine, **kwargs)[\"data\"][0][\n", + " \"embedding\"\n", + " ]\n", + "\n", + "\n", + "def cosine_similarity(a, b):\n", + " # Convert the input arrays to numpy arrays\n", + " a = np.asarray(a, dtype=np.float64)\n", + " b = np.asarray(b, dtype=np.float64)\n", + "\n", + " # Check for empty arrays or arrays with zero norms\n", + " if np.all(a == 0) or np.all(b == 0):\n", + " return 0.0\n", + "\n", + " dot_product = np.dot(a, b)\n", + " norm_a = np.linalg.norm(a)\n", + " norm_b = np.linalg.norm(b)\n", + " \n", + " similarity = dot_product / (norm_a * norm_b)\n", + " return similarity\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Simple Vector Search\n", + "#### Vector Search\n", "##### You will use the index name that you created previously on the ingestion steps for the respectively containers\n" ] }, @@ -194,7 +242,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Using Simple Search with Vector" + "#### Simple Vector Search" ] }, { @@ -203,8 +251,7 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "# Query Cosmos DB using Azure Cognitive Search\n", + "import openai\n", "\n", "container_name = 'text_sample'\n", "index_name = \"text_sample_index\"\n", @@ -212,29 +259,34 @@ "container = database.get_container_client(container_name)\n", "search_client = 
SearchClient(cog_search_endpoint, index_name, credential) \n", "\n", - "# Perform Azure Cognitive Search query\n", - "query = 'tools for software development'\n", "\n", + "query = 'tools for software development'\n", + "query_vector = get_embedding(query, k=3, engine=aoai_embedding_deployed_model)\n", "\n", - "search_results = search_client.search( \n", - " search_text=\"\", \n", - " vector=Vector(value=get_embedding(query, model=aoai_embedding_deployed_model), fields = \"content_vector\"), \n", - " select=[\"title\", \"content\", \"category\", \"title_vector\", \"content_vector\"]\n", - " )\n", "\n", + "# Perform Azure Cognitive Search query\n", + "search_results = search_client.search(search_text=query, select=[\"title\", \"content\", \"category\", \"title_vector\", \"content_vector\"])\n", "\n", "for result in search_results:\n", - " print(f\"Title: {result['title']}\")\n", - " print(f\"Score: {result['@search.score']}\")\n", - " print(f\"Content: {result['content']}\")\n", - " print(f\"Category: {result['category']}\")\n" + " result_vector = result.get(\"content_vector\", None)\n", + "\n", + " if result_vector is not None and len(result_vector) > 0:\n", + " similarity_score = cosine_similarity(query_vector, result_vector)\n", + "\n", + " print(f\"Title: {result['title']}\")\n", + " print(f\"Score: {result['@search.score']}\")\n", + " print(f\"Content: {result['content']}\")\n", + " print(f\"Category: {result['category']}\")\n", + " print(f\"Cosine Similarity: {similarity_score}\\n\")\n", + " else:\n", + " print(f\"Skipping result with empty or missing vector.\\n\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Cross Search (Two columns)" + "#### Cross Search (two columns)" ] }, { @@ -250,7 +302,7 @@ "search_client = SearchClient(cog_search_endpoint, index_name, credential) \n", "\n", "query = 'tools for software development'\n", - "query_vector = get_embedding(query, model=aoai_embedding_deployed_model)\n", + "query_vector = 
get_embedding(query, engine=aoai_embedding_deployed_model)\n", "\n", "# Perform Azure Cognitive Search query\n", "search_results = search_client.search(\n", @@ -292,7 +344,7 @@ "metadata": {}, "outputs": [], "source": [ - "##Enable the service: https://learn.microsoft.com/en-us/azure/search/semantic-how-to-enable-disable?tabs=enable-portal\n", + "##You must Enable the service: https://learn.microsoft.com/en-us/azure/search/semantic-how-to-enable-disable?tabs=enable-portal\n", "##Use the Semantic configuration defined at the Ingestion\n", "\n", "container_name = \"text_sample\"\n", @@ -308,7 +360,7 @@ "search_results = search_client.search(\n", " search_text=query,\n", " select=[\"title\", \"content\", \"category\", \"title_vector\", \"content_vector\"],\n", - " query_type=QueryType.SEMANTIC, query_language=QueryLanguage.EN_US, semantic_configuration_name=semantic_configuration, query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE,\n", + " query_type=QueryType.SEMANTIC, semantic_configuration_name=semantic_configuration, query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE,\n", " top=5\n", ")\n", "\n", @@ -343,7 +395,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Document search example" + "#### Document Vector Search" ] }, { @@ -361,12 +413,10 @@ "# Query Cosmos DB using Azure Cognitive Search\n", "\n", "query = 'web hosting services'\n", + "query_vector = get_embedding(query, k=3, engine=aoai_embedding_deployed_model)\n", "\n", - "search_results = search_client.search( \n", - " search_text=\"\", \n", - " vector=Vector(value=get_embedding(query, engine=aoai_embedding_deployed_model), fields = \"chunk_content_vector\"), \n", - " select=[\"chunk_content\", \"chunk_content_vector\"]\n", - " )\n", + "# Perform Azure Cognitive Search query\n", + "search_results = search_client.search(search_text=query, select=[\"chunk_content\", \"chunk_content_vector\"])\n", "\n", "\n", "for result in 
search_results:\n", @@ -403,7 +453,7 @@ "search_results = search_client.search(\n", " search_text=query,\n", " select=[\"chunk_content\", \"chunk_content_vector\"],\n", - " query_type=QueryType.SEMANTIC, query_language=QueryLanguage.EN_US, semantic_configuration_name=semantic_configuration, query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE,\n", + " query_type=QueryType.SEMANTIC, semantic_configuration_name=semantic_configuration, query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE,\n", " top=5\n", ")\n", "\n", diff --git a/code_samples/azure_cosmosdb_nosql/requirements.txt b/code_samples/azure_cosmosdb_nosql/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1a16fcc8e46e2b81fbab876c28fb86fcb71bb2b GIT binary patch literal 1580 zcma)+L2uJQ5QXQA#7_w#b_wOcAK=8H5@*Liv69%P-cpqx54>;2>vfX~khM8v-^{#u zZ)Si0YV4=o+ns%}jkPw|N#9_-z1O$3SAHtHwwLz8&ap&gTxIWse)}wL@|=@nuY`Kz zdCxOW%GQWUYv07>vskoZcGADSC5z~-^C8FUi%Okh0^14yxG>v#y-WPbZk0VPeUGY^ z=8kIR51#*uTWjzq`uI*ohwyVdp9V3u1JZT#B5MhBhI0DwYYQ1@F{v@dpjz6 zKPytX`nVO5UzK;z6rkPFIx$R(c61cHPO6=3^i0Wdkq`FQI+e5U-LY|{eq}q|)_yxE zo+^#1FgIw-Y~>pH9v6-}c%Q`XQ64pljVPwF(wF)FEo#zC9F}hWKZ@Fo&s`^{Vl#UC z;p~%mupu##^l+vA_OqI!;#$?jsZOX{uO2RkMj(3eYUT`dQo1P$#{|c&pi#JmwrmNW zJAm0{T#s59&}{$CCzg8Uj&6BbnBwAo_{Z+D-`~Za+x%j2TdPu514oiDqkd8GIA_}V z`w9Ell}WWaq1NwMnUc6Y2*LY>f4E&(