diff --git a/Dockerfile b/Dockerfile
index 266f125..5ec7d65 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11.1
+FROM python:3.11
WORKDIR /code
COPY ./requirements.txt /code/requirements.txt
diff --git a/app/api/v1/endpoints/stt.py b/app/api/v1/endpoints/stt.py
index 9a06ead..c04f7a5 100644
--- a/app/api/v1/endpoints/stt.py
+++ b/app/api/v1/endpoints/stt.py
@@ -20,7 +20,7 @@ def get_speech_to_text(
) -> SpeechText:
prefix = '.'.join(audio.key.split('.')[:-1])
- s3_controller.download_file(S3_BUCKET_NAME, AUDIO_S3_PREFIX + audio.key, f'stt/static/{audio.key}')
+ s3_controller.download_file(S3_BUCKET_NAME, AUDIO_S3_PREFIX + audio.key, f'app/static/{audio.key}')
stt_controller.convert_to_wav(prefix)
stt_controller.speech_to_text(prefix)
diff --git a/app/controller/keywords.py b/app/controller/keywords.py
index 2b8bb69..92480a3 100644
--- a/app/controller/keywords.py
+++ b/app/controller/keywords.py
@@ -12,4 +12,4 @@ def __init__(self, llm: LLMController = Depends(LLMController)):
def get_keywords(self, document: Document) -> Keywords:
self.llm.set_document(document)
answer = self.llm.request(self.prompt).content
- return Keywords(keywords=list(map(lambda word: word.strip()[1:-1], list(answer[1:-1].split(',')))))
\ No newline at end of file
+ return Keywords(keywords=list(map(lambda word: word.strip(), list(answer[1:-1].split(',')))))
\ No newline at end of file
diff --git a/app/controller/mindmap.py b/app/controller/mindmap.py
index f6b1446..3ceea15 100644
--- a/app/controller/mindmap.py
+++ b/app/controller/mindmap.py
@@ -5,12 +5,12 @@
from app.schemas.mindmap import MindMap
from app.schemas.context import Keywords
-from bs4 import BeautifulSoup
+import re
from typing import List
class MindMapController:
-    prompt = "Question: 문맥 내에서 %s들의 계층 구조를 html의 <ul>, <li>로 알려줘 \nAnswer: "
+ prompt = "Question: 문맥 내에서 %s들의 계층 구조를 MarkDown의 '-'로 알려줘 \nAnswer: -"
def __init__(self, llm: LLMController = Depends(LLMController)):
self.llm = llm
@@ -21,32 +21,36 @@ def delete_stopwords(self, html: str) -> str:
html.replace(stopword, '')
return html
- def parse_html(self, html: str, keywords: List[str]) -> MindMap:
+ def parse_html(self, markdown: str, keywords: List[str]) -> MindMap:
mindmap = MindMap()
mindmap.keywords = keywords
- keyword2index = {w: i for i, w in enumerate(keywords)}
-
- soup = BeautifulSoup(html, 'html.parser')
- prettified_html = soup.prettify()
- prettified_html = list(map(lambda x: x.strip(), prettified_html.split('\n')))
-        prettified_html = [html for html in prettified_html if (html in keywords) or (html in ['</ul>'])]
+ keyword2index = {v: i for i, v in enumerate(keywords)}
+ current = 0
stack = []
- for index, word in enumerate(prettified_html):
-            if word == '</ul>': stack.pop(len(stack)-1)
+ lines = markdown.split('\n')
+ for line in lines:
+ sep = line.split('- ')[0]
+ word = line.split('- ')[-1]
if word in keywords:
- if len(stack) == 0:
- stack.append(word)
- mindmap.root = keyword2index[stack[0]]
+ sep = len(sep)
+ mindmap.graph[str(keyword2index[word])] = []
+ print(stack)
+
+ if sep == 0:
+ mindmap.root = sep
+
+ if current == sep:
+ if len(stack) != 0: stack.pop()
+ if len(stack) != 0: mindmap.graph[str(stack[-1])].append(keyword2index[word])
+ stack.append(keyword2index[word])
+ elif current < sep:
+ mindmap.graph[str(stack[-1])].append(keyword2index[word])
+ stack.append(keyword2index[word])
else:
-                if prettified_html[index-1] != '</ul>': stack.pop(len(stack)-1)
- key = str(keyword2index[mindmap.root]) if len(stack) == 0 else str(keyword2index[stack[len(stack)-1]])
-
- if key not in mindmap.graph.keys():
- mindmap.graph[key] = [keyword2index[word]]
- else:
- mindmap.graph[key].append(keyword2index[word])
- stack.append(word)
+ stack.pop()
+ if len(stack) != 0: mindmap.graph[str(stack[-1])].append(keyword2index[word])
+ current = sep
return mindmap
def get_mindmap(self, document: Document, keywords: List[str]) -> MindMap:
diff --git a/app/controller/stt.py b/app/controller/stt.py
index 15b113f..49e6e00 100644
--- a/app/controller/stt.py
+++ b/app/controller/stt.py
@@ -9,15 +9,15 @@ def __init__(self):
pass
def convert_to_wav(self, prefix):
- output_path = f'stt/static/{prefix}.wav'
- y, sr = librosa.load(f'stt/static/{prefix}.m4a', sr=16000)
+ output_path = f'app/static/{prefix}.wav'
+ y, sr = librosa.load(f'app/static/{prefix}.m4a', sr=16000)
sf.write(output_path, y, sr)
def speech_to_text(self, prefix):
- os.system(f'./stt/whisper/main -m ./stt/whisper/models/ggml-medium.bin -l "ko" -f ./stt/static/{prefix}.wav -oj')
+ os.system(f'./app/whisper/main -m ./app/whisper/models/ggml-medium.bin -l "ko" -f ./app/static/{prefix}.wav -oj')
def get_speech_text(self, prefix):
- with open(f'stt/static/{prefix}.wav.json', 'r') as json_file:
+ with open(f'app/static/{prefix}.wav.json', 'r') as json_file:
json_data = json.load(json_file)
text = ""
diff --git a/requirements.txt b/requirements.txt
index 778e9be..a7c0c60 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,10 +7,8 @@ attrs==23.1.0
audioread==3.0.1
backoff==2.2.1
bcrypt==4.0.1
-beautifulsoup4==4.12.2
-boto3==1.28.84
-botocore==1.31.84
-bs4==0.0.1
+boto3==1.28.85
+botocore==1.31.85
cachetools==5.3.2
certifi==2023.7.22
cffi==1.16.0
@@ -47,8 +45,8 @@ jsonpatch==1.33
jsonpointer==2.4
kubernetes==28.1.0
langchain==0.0.335
-langchainhub==0.1.13
-langsmith==0.0.63
+langchainhub==0.1.14
+langsmith==0.0.64
lazy_loader==0.3
librosa==0.10.1
llvmlite==0.41.1
@@ -65,7 +63,7 @@ numba==0.58.1
numpy==1.26.2
oauthlib==3.2.2
onnxruntime==1.16.2
-openai==1.2.3
+openai==1.2.4
opentelemetry-api==1.21.0
opentelemetry-exporter-otlp-proto-common==1.21.0
opentelemetry-exporter-otlp-proto-grpc==1.21.0
@@ -82,8 +80,8 @@ pulsar-client==3.3.0
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
-pydantic==2.4.2
-pydantic_core==2.10.1
+pydantic==2.5.0
+pydantic_core==2.14.1
pypdf==3.17.0
PyPika==0.48.9
python-dateutil==2.8.2
@@ -100,7 +98,6 @@ scipy==1.11.3
six==1.16.0
sniffio==1.3.0
soundfile==0.12.1
-soupsieve==2.5
soxr==0.3.7
SQLAlchemy==2.0.23
starlette==0.27.0