feat: Update getDocument method in KsDocumentController

Refactor the getDocument method in KsDocumentController to return a single KSDocument object instead of a list. This improves the efficiency and readability of the code. The method now uses the findById method of ksDocumentRepository to retrieve the document with the specified id.
This commit is contained in:
andrea.terzani
2024-07-31 17:30:08 +02:00
parent 1f8c5a062b
commit 1fdd4f52ec
4 changed files with 169 additions and 3 deletions

12
pom.xml
View File

@@ -90,6 +90,14 @@
<artifactId>lombok</artifactId>
<version>1.18.34</version>
</dependency>
<dependency>
<groupId>org.eclipse.jgit</groupId>
<artifactId>org.eclipse.jgit</artifactId>
<version>6.8.0.202311291450-r</version>
</dependency>
</dependencies>
<build>
@@ -133,6 +141,10 @@
<enabled>false</enabled>
</releases>
</repository>
<repository>
<id>jgit-repository</id>
<url>https://repo.eclipse.org/content/groups/releases/</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>

View File

@@ -30,9 +30,9 @@ public class KsDocumentController {
return result;
}
// Returns the KSDocument stored under the given id.
//
// This span shows the removed and the added side of the change together:
// of the two signature lines and the two `result` lines below, the first of
// each pair is the old list-returning version (findAll) and the second is the
// new single-document version (findById).
//
// NOTE(review): the mapping declares an `{id}` path template, but the
// parameter is annotated with @RequestParam; presumably @PathVariable was
// intended - confirm how clients pass the id.
@GetMapping("/{id}")
public List<KSDocument> getDocument(@RequestParam String id) {
public KSDocument getDocument(@RequestParam String id) {
List<KSDocument> result = (List<KSDocument>) ksDocumentREpository.findAll();
// NOTE(review): get() is called without a presence check; presumably this
// throws when no document exists for the id - confirm that is the intended
// response for unknown ids.
KSDocument result = ksDocumentREpository.findById(id).get();
return result;
}

View File

@@ -1,10 +1,13 @@
package com.olympus.apollo.controllers;
import java.util.HashMap;
import java.util.List;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RestController;
import com.olympus.apollo.models.KSIngestionInfo;
import com.olympus.apollo.services.GitRepositoryIngestor;
import com.olympus.apollo.services.KSIngestor;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestParam;
@@ -16,7 +19,9 @@ public class TestController {
@Autowired
KSIngestor ksIngestor;
@Autowired
GitRepositoryIngestor gitRepositoryIngestor;
@GetMapping("test/ingestion_loop")
public String testIngestionLoop() {
ksIngestor.ingestLoop();
@@ -34,5 +39,33 @@ public class TestController {
return "Deleted";
}
/**
 * Test endpoint that builds a fixed {@link KSIngestionInfo} and asks the
 * {@code gitRepositoryIngestor} to ingest a local Git repository with it.
 *
 * @return {@code "Ingested"} when the ingestor returns normally,
 *         {@code "Error"} when it throws
 */
@GetMapping("test/ingest_repo")
public String ingestRepo() {
    // NOTE(review): developer-machine path; this endpoint only works where
    // this directory exists. Consider moving it to configuration.
    String repoPath = "C:\\Users\\andrea.terzani\\dev\\DOO2_CLOUD";
    try {
        // Metadata copied onto every ingested document. The keys are kept
        // exactly as they were, since other code may look them up by name.
        HashMap<String, String> metadata = new HashMap<>();
        metadata.put("KsApplicatioName", "doo");
        metadata.put("KsDoctype", "sourcecode");
        metadata.put("KsDoSource", "GIT");

        KSIngestionInfo ksIngestionInfo = new KSIngestionInfo();
        ksIngestionInfo.setMetadata(metadata);
        ksIngestionInfo.setDefaultChunkSize(6000);
        ksIngestionInfo.setMinChunkSize(200);
        ksIngestionInfo.setMaxNumberOfChunks(10000);
        ksIngestionInfo.setMinChunkSizeToEmbed(100);

        gitRepositoryIngestor.ingestGitRepository(repoPath, ksIngestionInfo);
        return "Ingested";
    } catch (Exception e) {
        // Previously the failure was dropped silently; record it (with its
        // cause) so a bare "Error" response can be diagnosed from the logs.
        System.getLogger(getClass().getName())
                .log(System.Logger.Level.ERROR, "Git repository ingestion failed for " + repoPath, e);
        return "Error";
    }
}
}

View File

@@ -0,0 +1,121 @@
package com.olympus.apollo.services;
import org.eclipse.jgit.api.Git;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.treewalk.TreeWalk;
import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.stereotype.Service;
import com.olympus.apollo.models.KSIngestionInfo;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reads the source files of a local Git repository and stores them, split
 * into chunks, in a {@link VectorStore}.
 *
 * <p>Files are taken from the tree of the most recent commit returned by
 * {@code git log} (JGit starts from {@code HEAD} when no start point is
 * given), not from the working directory, so uncommitted changes are not
 * ingested.
 */
@Service
public class GitRepositoryIngestor {

    /**
     * Best-effort matcher for Java type names following the {@code class}
     * keyword. The leading word boundary keeps identifiers that merely end in
     * "class" (for example {@code subclass Foo}) from producing false hits.
     * Occurrences inside comments or string literals are still matched.
     */
    private static final Pattern CLASS_NAME_PATTERN = Pattern.compile("\\bclass\\s+(\\w+)");

    private final VectorStore vectorStore;

    /**
     * @param vectorStore store that receives the chunked documents
     */
    public GitRepositoryIngestor(VectorStore vectorStore) {
        this.vectorStore = vectorStore;
    }

    /**
     * Ingests every relevant file of the latest commit of the repository at
     * {@code repoPath}.
     *
     * @param repoPath      path of a local Git repository
     * @param ingestionInfo chunking parameters and the metadata to attach to
     *                      every document; its metadata map may be {@code null}
     * @throws Exception if the repository cannot be opened or read, or if
     *                   splitting or storing the documents fails
     */
    public void ingestGitRepository(String repoPath, KSIngestionInfo ingestionInfo) throws Exception {
        try (Git git = Git.open(new File(repoPath))) {
            Repository repository = git.getRepository();
            RevCommit latestCommit = git.log().setMaxCount(1).call().iterator().next();

            List<Document> documents = collectDocuments(repository, latestCommit, ingestionInfo);
            if (documents.isEmpty()) {
                // Nothing matched isRelevantFile: there is nothing to split or store.
                return;
            }

            TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(),
                    ingestionInfo.getMinChunkSize(),
                    ingestionInfo.getMinChunkSizeToEmbed(),
                    ingestionInfo.getMaxNumberOfChunks(),
                    false);
            vectorStore.add(splitter.split(documents));
        }
    }

    /**
     * Walks the tree of {@code commit} recursively and turns every relevant
     * file into a {@link Document} carrying file-derived and caller-supplied
     * metadata.
     */
    private List<Document> collectDocuments(Repository repository, RevCommit commit,
            KSIngestionInfo ingestionInfo) throws IOException {
        List<Document> documents = new ArrayList<>();
        try (TreeWalk treeWalk = new TreeWalk(repository)) {
            treeWalk.addTree(commit.getTree());
            treeWalk.setRecursive(true);
            while (treeWalk.next()) {
                String fileName = treeWalk.getNameString();
                if (!isRelevantFile(fileName)) {
                    continue;
                }
                String filePath = treeWalk.getPathString();
                byte[] fileContent = repository.open(treeWalk.getObjectId(0)).getBytes();
                String fileContentStr = new String(fileContent, StandardCharsets.UTF_8);

                Map<String, String> metadata = extractMetadata(fileName, fileContentStr);
                metadata.put("filePath", filePath);
                metadata.put("fileName", fileName);

                Document doc = new Document(fileContentStr);
                doc.getMetadata().putAll(metadata);
                // Caller metadata is applied last, so it wins on key clashes.
                // It is optional: skip it instead of failing when none was set.
                if (ingestionInfo.getMetadata() != null) {
                    doc.getMetadata().putAll(ingestionInfo.getMetadata());
                }
                documents.add(doc);
            }
        }
        return documents;
    }

    /** Returns whether a file with this name should be ingested. */
    private boolean isRelevantFile(String fileName) {
        // Add more relevant file extensions as needed
        return fileName.endsWith(".java");
    }

    /**
     * Dispatches to the language-specific metadata extractor chosen by file
     * extension; unknown extensions yield an empty, mutable map.
     */
    private Map<String, String> extractMetadata(String fileName, String fileContent) {
        Map<String, String> metadata = new HashMap<>();
        if (fileName.endsWith(".java")) {
            metadata.putAll(extractJavaMetadata(fileContent));
        } else if (fileName.endsWith(".py")) {
            metadata.putAll(extractPythonMetadata(fileContent));
        } else if (fileName.endsWith(".js")) {
            metadata.putAll(extractJavaScriptMetadata(fileContent));
        }
        return metadata;
    }

    /**
     * Collects the names that follow the {@code class} keyword and stores them
     * comma-separated under {@code "classNames"} (an empty string when none
     * are found).
     */
    private Map<String, String> extractJavaMetadata(String fileContent) {
        List<String> classNames = new ArrayList<>();
        Matcher classMatcher = CLASS_NAME_PATTERN.matcher(fileContent);
        while (classMatcher.find()) {
            classNames.add(classMatcher.group(1));
        }

        Map<String, String> metadata = new HashMap<>();
        metadata.put("classNames", String.join(",", classNames));
        return metadata;
    }

    /** Placeholder: no Python-specific metadata is extracted yet. */
    private Map<String, String> extractPythonMetadata(String fileContent) {
        // Implement Python-specific metadata extraction
        // This is a placeholder and should be implemented based on your needs
        return new HashMap<>();
    }

    /** Placeholder: no JavaScript-specific metadata is extracted yet. */
    private Map<String, String> extractJavaScriptMetadata(String fileContent) {
        // Implement JavaScript-specific metadata extraction
        // This is a placeholder and should be implemented based on your needs
        return new HashMap<>();
    }
}