Merge branch 'azure_search' into 'develop'

Merge the Azure Search changes into the develop branch.

See merge request olympus_ai/apollo!6
This commit is contained in:
Sumedh
2024-10-29 05:38:42 +00:00
7 changed files with 128 additions and 136 deletions

20
pom.xml
View File

@@ -58,7 +58,7 @@
<artifactId>spring-boot-starter-data-rest</artifactId>
</dependency>
<dependency>
<!--<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-mongodb-atlas-store-spring-boot-starter</artifactId>
</dependency>
@@ -67,11 +67,23 @@
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-openai-spring-boot-starter</artifactId>
</dependency>
-->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-azure-openai-spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-azure-store</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-netflix-eureka-client</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-openfeign</artifactId>
@@ -87,6 +99,12 @@
<version>1.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.springdoc</groupId>
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>
<version>2.5.0</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>

View File

@@ -1,10 +1,18 @@
package com.olympus.apollo.config;
import com.azure.core.credential.AzureKeyCredential;
import com.azure.search.documents.indexes.SearchIndexClient;
import com.azure.search.documents.indexes.SearchIndexClientBuilder;
import org.springframework.ai.azure.openai.AzureOpenAiEmbeddingModel;
import org.springframework.ai.embedding.EmbeddingModel;
import org.springframework.ai.openai.OpenAiEmbeddingModel;
/*import org.springframework.ai.openai.OpenAiEmbeddingModel;
import org.springframework.ai.openai.api.OpenAiApi;
import org.springframework.ai.vectorstore.MongoDBAtlasVectorStore;
import org.springframework.ai.vectorstore.MongoDBAtlasVectorStore;*/
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.ai.vectorstore.azure.AzureVectorStore;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.SpringBootConfiguration;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
@@ -12,34 +20,39 @@ import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.data.mongodb.core.MongoTemplate;
import java.util.ArrayList;
import java.util.List;
@Configuration
@SpringBootConfiguration
@EnableAutoConfiguration
public class EmbeddingConfig {
@Value("${spring.ai.openai.api-key}")
private String openAiKey;
@Value("${spring.data.mongodb.database}")
private String databaseName;
@Value("${spring.ai.vectorstore.mongodb.collection-name:vector_store}")
private String collectionName;
@Value("${spring.ai.vectorstore.mongodb.indexName:vector_index}")
private String indexName;
@Value("${spring.data.mongodb.uri}")
private String mongoUri;
@Value("${spring.ai.vectorstore.mongodb.initialize-schema}")
private Boolean initSchema;
// Add beans here...
@Value("${spring.ai.vectorstore.azure.api-key}")
private String azureKey;
@Value("${spring.ai.vectorstore.azure.url}")
private String azureEndpoint;
@Value("${spring.ai.vectorstore.azure.initialize-schema}")
private boolean initSchema;
@Bean
public EmbeddingModel embeddingModel() {
return new OpenAiEmbeddingModel(new OpenAiApi(openAiKey));
public SearchIndexClient searchIndexClient() {
return new SearchIndexClientBuilder().endpoint(azureEndpoint)
.credential(new AzureKeyCredential(azureKey))
.buildClient();
}
@Bean
public VectorStore mongodbVectorStore(MongoTemplate mongoTemplate, EmbeddingModel embeddingModel) {
return new MongoDBAtlasVectorStore(mongoTemplate, embeddingModel,
MongoDBAtlasVectorStore.MongoDBVectorStoreConfig.builder().build(), initSchema);
public VectorStore vectorStore(SearchIndexClient searchIndexClient, @Qualifier("azureOpenAiEmbeddingModel") EmbeddingModel embeddingModel) {
List<AzureVectorStore.MetadataField> fields = new ArrayList<>();
fields.add(AzureVectorStore.MetadataField.text("KsApplicationName"));
fields.add(AzureVectorStore.MetadataField.text("KsProjectName"));
fields.add(AzureVectorStore.MetadataField.text("KsDoctype"));
fields.add(AzureVectorStore.MetadataField.text("KsDocSource"));
fields.add(AzureVectorStore.MetadataField.text("KsFileSource"));
fields.add(AzureVectorStore.MetadataField.text("KsDocumentId"));
return new AzureVectorStore(searchIndexClient, embeddingModel,initSchema, fields);
}
}

View File

@@ -59,7 +59,6 @@ public class KSFileController {
Date now = new Date();
ksDocument.setIngestionDate(now);
KSIngestionInfo ksIngestionInfo = new KSIngestionInfo();
ksIngestionInfo.setType(fileUploadDTO.getType()); // != null ? type : "MD_DOCUMENT"
@@ -68,6 +67,7 @@ public class KSFileController {
metadata.put("KsDoctype", fileUploadDTO.getKsDocType());
metadata.put("KsDocSource", fileUploadDTO.getKsDocSource());
metadata.put("KsFileSource", file.getOriginalFilename());
metadata.put("KsProjectName", fileUploadDTO.getKsProjectName());
ksIngestionInfo.setMetadata(metadata);
ksIngestionInfo.setDefaultChunkSize(fileUploadDTO.getDefaultChunkSize());
@@ -75,7 +75,7 @@ public class KSFileController {
ksIngestionInfo.setMaxNumberOfChunks(fileUploadDTO.getMaxNumberOfChunks());
ksIngestionInfo.setMinChunkSizeToEmbed(fileUploadDTO.getMinChunkSizeToEmbed());
ksIngestionInfoRepository.save(ksIngestionInfo);
//ksIngestionInfoRepository.save(ksIngestionInfo);
ksDocument.setIngestionInfo(ksIngestionInfo);
ksDocumentREpository.save(ksDocument);

View File

@@ -15,6 +15,7 @@ public class CorsConfig implements WebMvcConfigurer {
public void addCorsMappings(CorsRegistry registry) {
registry.addMapping("/**")
.allowedOrigins(apollo_frontend_url)
.allowedOriginPatterns("**")
.allowedHeaders("*")
.allowedMethods("GET", "POST", "PUT", "DELETE","OPTIONS");
}

View File

@@ -9,12 +9,15 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.olympus.dto.DeletionRequest;
import com.olympus.model.apollo.VectorStore;
import org.springframework.ai.document.Document;
import org.springframework.ai.vectorstore.SearchRequest;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.messaging.simp.SimpMessagingTemplate;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.ai.vectorstore.VectorStore;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
@@ -45,41 +48,33 @@ public class DeletionService {
@Autowired
private SimpMessagingTemplate simpMessagingTemplate;
@Autowired
private VectorStore vectorStore;
@Async("asyncTaskExecutor")
public void deleteRecords(DeletionRequest deletionRequest) {
try {
boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null && !deletionRequest.getKsDocumentId().isEmpty() && ksDocumentRepository.existsById(deletionRequest.getKsDocumentId());
boolean KSIngestionInfoExists = deletionRequest.getKsIngestionInfoId() != null && !deletionRequest.getKsIngestionInfoId().isEmpty() && ksIngestionInfoRepository.existsById(deletionRequest.getKsIngestionInfoId());
boolean vectorStoreExists = deletionRequest.getKsApplicationName() != null && deletionRequest.getKsDocSource() != null && deletionRequest.getKsFileSource() != null && deletionRequest.getKsDoctype() != null;
//TODO: COMPLETE REFACTOR REQUIRED TO DELETE RECORD FROM AZURE SEARCH
// NOT WORKING AT THE MOMENT
boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null &&
!deletionRequest.getKsDocumentId().isEmpty() &&
ksDocumentRepository.existsById(deletionRequest.getKsDocumentId());
if(KSDocumentExists){
SearchRequest searchRequest = SearchRequest.defaults()
.withQuery("a").withTopK(1000)
.withSimilarityThreshold(0.0)
.withFilterExpression("KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'");
List<VectorStore> vectorStoreMetadataDetails = vectorStoreExists ? vectorStoreRepository.findDocumentVectorByMetadata(deletionRequest.getKsDoctype(), deletionRequest.getKsDocSource(), deletionRequest.getKsFileSource(), deletionRequest.getKsApplicationName()) : List.of();
List<Document> docs = vectorStore.similaritySearch(searchRequest);
List<String> ids = docs.stream().map(Document::getId).toList();
vectorStore.delete(ids);
if (KSDocumentExists && KSIngestionInfoExists && !vectorStoreMetadataDetails.isEmpty()) {
if (deletionRequest.getKsDocumentId() != null && !deletionRequest.getKsDocumentId().isEmpty()) {
ksDocumentRepository.deleteById(deletionRequest.getKsDocumentId());
logger.info("KSDocument with id {} deleted successfully.", deletionRequest.getKsDocumentId()+" "+Thread.currentThread().getName());
}
if (deletionRequest.getKsIngestionInfoId() != null && !deletionRequest.getKsIngestionInfoId().isEmpty()) {
ksIngestionInfoRepository.deleteById(deletionRequest.getKsIngestionInfoId());
logger.info("KSIngestionInfo with id {} deleted successfully.", deletionRequest.getKsIngestionInfoId()+" "+Thread.currentThread().getName());
}
for (VectorStore store : vectorStoreMetadataDetails) {
vectorStoreRepository.deleteById(store.getId());
logger.info("VectorStore with id {} deleted successfully.", store.getId()+" "+Thread.currentThread().getName());
}
logger.info("All records deleted successfully.");
} else {
if (!KSDocumentExists) {
logger.warn("KSDocument with id {} does not exist.", deletionRequest.getKsDocumentId()+" "+Thread.currentThread().getName());
} else if (!KSIngestionInfoExists) {
logger.warn("KSIngestionInfo with id {} does not exist.", deletionRequest.getKsIngestionInfoId()+" "+Thread.currentThread().getName());
} else if (vectorStoreMetadataDetails.isEmpty()) {
logger.warn("No VectorStore Data available",Thread.currentThread().getName());
}
ksDocumentRepository.deleteById(deletionRequest.getKsDocumentId());
logger.info("KSDocument with id {} deleted successfully.", deletionRequest.getKsDocumentId());
}else{
logger.warn("KSDocument with id {} does not exist.", deletionRequest.getKsDocumentId());
}
} catch (Exception e) {
logger.error("An error occurred while deleting records: ", e+" "+Thread.currentThread().getName());
@@ -122,9 +117,9 @@ public class DeletionService {
String ingestionStatus = ksGitInfo.getIngestionStatus();
logger.info("Ingestion Status is {}.", ingestionStatus);
List<VectorStore> vectorStoreMetadataDetails = vectorStoreGitDetailsExists
List<VectorStore> vectorStoreMetadataDetails = null; /*vectorStoreGitDetailsExists
? vectorStoreRepository.findGitVectorByMetadata(ksDoctype,ksDocSource, ksFileSource, applicationName, ksBranch)
: List.of();
: List.of();*/
if (KSGitInfoExists && KSGitIngestionInfoExists) {
deleteRecordsBasedOnIngestionStatus(ksGitInfoId,ksBranch,ingestionStatus,ksGitIngestionInfoId,vectorStoreMetadataDetails,applicationName);
@@ -218,11 +213,12 @@ public class DeletionService {
private void deleteVectorStores(List<VectorStore> vectorStoreMetadataDetails, String applicationName){
if(!vectorStoreMetadataDetails.isEmpty()){
for (VectorStore store : vectorStoreMetadataDetails) {
/* for (VectorStore store : vectorStoreMetadataDetails) {
String storeId=store.getId();
vectorStoreRepository.deleteById(storeId);
logger.info("VectorStore with id {} deleted successfully.", applicationName, storeId);
}
}*/
}
}
@@ -233,7 +229,8 @@ public class DeletionService {
try {
boolean KSTextExists = ksTextsRepository.existsById(id);
List<VectorStore> vectorStoreMetadataDetails = vectorStoreRepository.findByKsInternalMainEntityId(id);
/*
List<Object> vectorStoreMetadataDetails = vectorStoreRepository.findByKsInternalMainEntityId(id);
if (KSTextExists && !vectorStoreMetadataDetails.isEmpty()) {
for (VectorStore store : vectorStoreMetadataDetails) {
@@ -252,7 +249,7 @@ public class DeletionService {
} else if (vectorStoreMetadataDetails.isEmpty()) {
logger.warn("No VectorStore Data available",Thread.currentThread().getName());
}
}
}*/
} catch (Exception e) {
logger.error("An error occurred while deleting records: ", e+" "+Thread.currentThread().getName());
throw new RuntimeException("An error occurred while deleting records", e);

View File

@@ -28,15 +28,17 @@ public class KSIngestor {
@Autowired
private KSDocumentRepository ksDocumentRepository;
@Autowired
private KSTextsRepository ksTextsRepository;
@Autowired
private KSIngestionInfoRepository ksIngestionInfoRepository;
@Autowired
private FileSystemStorageService storageService;
@Autowired
private VectorStore vectorStore;
Logger logger = LoggerFactory.getLogger(KSIngestor.class);
public void deleteAll(String document_file_name) {
@@ -48,49 +50,11 @@ public class KSIngestor {
}
public IngestionOutput ingestLoop() {
IngestionOutput ingestionLoopOutput = new IngestionOutput();
try {
ksDocumentRepository.findAllByIngestionStatus("NEW").forEach(ksDocument -> {
logger.info("Processing document: " + ksDocument.getFilePath());
// ingest the document
ksDocument.setIngestionStatus("IN PROGRESS");
ksDocumentRepository.save(ksDocument);
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
List<Document> docs = tikaDocumentReader.read();
logger.info("Ingested document: " + ksDocument.getFilePath());
logger.info("Number of documents: " + docs.size());
KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo();
TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(),
ingestionInfo.getMinChunkSize(),
ingestionInfo.getMinChunkSizeToEmbed(),
ingestionInfo.getMaxNumberOfChunks(),
true);
docs.forEach(doc -> {
List<Document> splitDocs = splitter.split(doc);
logger.info("Number of documents: " + splitDocs.size());
for (Document splitDoc : splitDocs) {
logger.info("Split before put document metadata: " + splitDoc.getMetadata());
splitDoc.getMetadata().putAll(getMetadata(ingestionInfo));
logger.info("Split after put document metadata: " + splitDoc.getMetadata());
}
embedDocuments(splitDocs, ingestionInfo);
});
ksDocument.setIngestionStatus("INGESTED");//we have to set to DONE
ksDocument.setIngestionDate(new Date());
ksDocument.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(new Date()));
ksDocumentRepository.save(ksDocument);
ingestDocument(ksDocument);
ingestionLoopOutput.getIngestedDocumentId().add(ksDocument.getId());
});
ingestionLoopOutput.setStatus("OK");
@@ -139,12 +103,15 @@ public class KSIngestor {
ingestionInfo.getMaxNumberOfChunks(),
true);
HashMap<String, String> metadata = ingestionInfo.getMetadata();
metadata.put("KsDocumentId",ksDocument.getId());
docs.forEach(doc -> {
List<Document> splitDocs = splitter.split(doc);
logger.info("Number of documents: " + splitDocs.size());
for (Document splitDoc : splitDocs) {
splitDoc.getMetadata().putAll(getMetadata(ingestionInfo));
splitDoc.getMetadata().putAll(metadata);
}
embedDocuments(splitDocs, ingestionInfo);
});
@@ -251,11 +218,13 @@ public class KSIngestor {
}
public List<String> testSimilaritySearch(String query,String filterQuery) {
List<Document> docs = vectorStore.similaritySearch(
SearchRequest.defaults()
.withQuery(query)
.withTopK(5).withSimilarityThreshold(0.8)
.withFilterExpression(filterQuery));
SearchRequest searchRequest = SearchRequest.defaults().withQuery(query).withTopK(5).withSimilarityThreshold(0.1);
if(filterQuery!=null && !filterQuery.isEmpty()){
searchRequest.withFilterExpression(filterQuery);
}
List<Document> docs = vectorStore.similaritySearch(searchRequest);
List<String> result = new ArrayList<String>();
for (Document doc : docs) {
@@ -265,19 +234,4 @@ public class KSIngestor {
}
private HashMap<String, String> getMetadata(KSIngestionInfo ingestionInfo) {
return ingestionInfo.getMetadata();
/* HashMap<String, String> metadata = new HashMap<String, String>();
for (String meta : metadatas) {
String[] keyValue = meta.split(":");
metadata.put(keyValue[0], keyValue[1]);
}
return metadata;*/
}
}

View File

@@ -10,20 +10,23 @@ spring:
application:
name: apollo
ai:
azure:
openai:
endpoint: "https://ai-olympus.openai.azure.com/"
api-key: "9fb33cc69d914d4c8225b974876510b5"
openai:
api-key:
api-key: "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
vectorstore:
mongodb:
uri:
indexName: vector_index
collection-name: vector_store
initialize-schema: false
azure:
api-key: "jxKqZvbMKuo1MwXs8ilEAeRDeswtoTXO1lWX600jP2AzSeDXo1nq"
url: "https://search-olympus.search.windows.net"
initialize-schema: true
data:
mongodb:
uri:
database:
username:
password:
uri: mongodb+srv://olympus_adm:26111979@olympus.l6qor4p.mongodb.net/?retryWrites=true&w=majority&appName=Olympus
database: olympus
username: olympus_adm
password: 26111979
servlet:
multipart:
max-file-size: 5000000MB
@@ -34,8 +37,14 @@ ingestion:
repository:
basepath: C:\\Users\\andrea.terzani\\dev\\Olympus
gitlab:
token:
token: "xxxxxxxx"
path: /mnt/apollo_storage/repository #C:\\repos\\olympus_ai\\gitClone
cloud:
url: "https://gi2tlab.com/api/v4"
token: "xxxxxxxx"
onpremises:
url: "http://localhost:8081/api"
token: "xxxxxxxx"
eureka:
client: