Commit 148b0b4

fix merge

JohnJyong committed Dec 25, 2024
1 parent 588fe21 commit 148b0b4

Showing 7 changed files with 30 additions and 36 deletions.
13 changes: 5 additions & 8 deletions api/core/indexing_runner.py
@@ -15,7 +15,6 @@
 from configs import dify_config
 from core.entities.knowledge_entities import IndexingEstimate, PreviewDetail, QAPreviewDetail
 from core.errors.error import ProviderTokenNotInitError
-from core.llm_generator.llm_generator import LLMGenerator
 from core.model_manager import ModelInstance, ModelManager
 from core.model_runtime.entities.model_entities import ModelType
 from core.rag.cleaner.clean_processor import CleanProcessor
@@ -294,15 +293,15 @@ def indexing_estimate(
                 process_rule=processing_rule.to_dict(),
                 tenant_id=current_user.current_tenant_id,
                 doc_language=doc_language,
-                preview=True
+                preview=True,
             )
             total_segments += len(documents)
             for document in documents:
                 if len(preview_texts) < 10:
                     if doc_form and doc_form == "qa_model":
-                        preview_detail = QAPreviewDetail(question=document.page_content,
-                                                         answer=document.metadata.get("answer")
-                                                         )
+                        preview_detail = QAPreviewDetail(
+                            question=document.page_content, answer=document.metadata.get("answer")
+                        )
                         preview_texts.append(preview_detail)
                     else:
                         preview_detail = PreviewDetail(content=document.page_content)
@@ -325,9 +324,7 @@ def indexing_estimate(
                     db.session.delete(image_file)
 
         if doc_form and doc_form == "qa_model":
-            return IndexingEstimate(
-                total_segments=total_segments * 20, qa_preview=preview_texts, preview=[]
-            )
+            return IndexingEstimate(total_segments=total_segments * 20, qa_preview=preview_texts, preview=[])
         return IndexingEstimate(total_segments=total_segments, preview=preview_texts)
 
     def _extract(
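
Note on the hunks above: indexing_estimate caps previews at 10 entries, and for "qa_model" datasets returns the segment estimate scaled by 20. A minimal sketch of the entities those return statements build; the field sets here are assumptions for illustration, not the actual definitions in core.entities.knowledge_entities:

    from typing import Optional

    from pydantic import BaseModel


    class PreviewDetail(BaseModel):  # hypothetical fields, for illustration
        content: str


    class QAPreviewDetail(BaseModel):  # hypothetical fields, for illustration
        question: str
        answer: Optional[str] = None  # document.metadata.get("answer") may be absent


    class IndexingEstimate(BaseModel):  # hypothetical fields, for illustration
        total_segments: int
        preview: list[PreviewDetail] = []
        qa_preview: Optional[list[QAPreviewDetail]] = None

With stand-ins shaped like these, both return statements in the hunk construct and validate as written.
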
1 change: 0 additions & 1 deletion api/core/rag/extractor/extract_processor.py
@@ -24,7 +24,6 @@
 from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
 from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
 from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
-from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
 from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
 from core.rag.extractor.word_extractor import WordExtractor
 from core.rag.models.document import Document
@@ -123,9 +123,7 @@ def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords:
             vector.delete()
 
         if delete_child_chunks:
-            db.session.query(ChildChunk).filter(
-                ChildChunk.dataset_id == dataset.id
-            ).delete()
+            db.session.query(ChildChunk).filter(ChildChunk.dataset_id == dataset.id).delete()
             db.session.commit()
 
     def retrieve(
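
The collapsed one-liner above is a query-level delete: SQLAlchemy emits a single bulk DELETE without loading any ChildChunk rows into the session, unlike calling session.delete() once per object. A self-contained sketch of the pattern, using a stand-in model rather than Dify's actual schema:

    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()


    class ChildChunk(Base):  # stand-in for models.dataset.ChildChunk
        __tablename__ = "child_chunks"
        id = Column(Integer, primary_key=True)
        dataset_id = Column(String, index=True)


    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add_all(ChildChunk(dataset_id="ds-1") for _ in range(3))
        session.commit()

        # One bulk DELETE statement; rows are never materialized as objects.
        session.query(ChildChunk).filter(ChildChunk.dataset_id == "ds-1").delete()
        session.commit()

        assert session.query(ChildChunk).count() == 0
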
12 changes: 7 additions & 5 deletions api/core/rag/index_processor/processor/qa_index_processor.py
@@ -70,11 +70,13 @@ def transform(self, documents: list[Document], **kwargs) -> list[Document]:
                     split_documents.append(document_node)
             all_documents.extend(split_documents)
         if preview:
-            self._format_qa_document(current_app._get_current_object(),
-                                     kwargs.get("tenant_id"),
-                                     all_documents[0],
-                                     all_qa_documents,
-                                     kwargs.get("doc_language", "English"))
+            self._format_qa_document(
+                current_app._get_current_object(),
+                kwargs.get("tenant_id"),
+                all_documents[0],
+                all_qa_documents,
+                kwargs.get("doc_language", "English"),
+            )
         else:
             for i in range(0, len(all_documents), 10):
                 threads = []
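
In the non-preview branch above, documents are handled in slices of 10, spawning one worker thread per document before moving to the next slice. A generic sketch of that batch-and-join shape; the worker here is a placeholder, not Dify's _format_qa_document, and the per-batch join is assumed from the surrounding code rather than shown in the hunk:

    import threading


    def handle(item: str) -> None:
        print("processing", item)  # placeholder worker body


    def process_in_batches(items: list[str], batch_size: int = 10) -> None:
        # Mirror of the loop shape above: slice, spawn one thread per item,
        # then wait for the whole batch before starting the next slice.
        for i in range(0, len(items), batch_size):
            threads = [
                threading.Thread(target=handle, args=(item,))
                for item in items[i : i + batch_size]
            ]
            for t in threads:
                t.start()
            for t in threads:
                t.join()


    process_in_batches([f"doc-{n}" for n in range(25)])
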
9 changes: 6 additions & 3 deletions api/models/dataset.py
@@ -571,9 +571,12 @@ def child_chunks(self):
         if process_rule.mode == "hierarchical":
             rules = Rule(**process_rule.rules_dict)
             if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
-                child_chunks = db.session.query(ChildChunk).filter(
-                    ChildChunk.segment_id == self.id
-                ).order_by(ChildChunk.position.asc()).all()
+                child_chunks = (
+                    db.session.query(ChildChunk)
+                    .filter(ChildChunk.segment_id == self.id)
+                    .order_by(ChildChunk.position.asc())
+                    .all()
+                )
                 return child_chunks or []
         else:
             return []
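
The change above is pure formatting: wrapping a fluent chain in parentheses lets each call sit on its own line with no backslash continuations and no behavior change. The same idiom outside the ORM:

    # Same chain, reformatted: the parentheses allow one call per line.
    text = "  Hello, Diff  "
    words = (
        text.strip()
        .lower()
        .replace(",", "")
        .split()
    )
    assert words == ["hello", "diff"]
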
23 changes: 9 additions & 14 deletions api/schedule/mail_clean_document_notify_task.py
@@ -7,7 +7,7 @@
 
 from extensions.ext_mail import mail
 from models.account import Account, Tenant, TenantAccountJoin
-from models.dataset import Dataset, DatasetAutoDisableLog, Document
+from models.dataset import Dataset, DatasetAutoDisableLog
 
 
 @shared_task(queue="mail")
@@ -20,16 +20,12 @@ def send_document_clean_notify_task():
     if not mail.is_inited():
         return
 
-    logging.info(
-        click.style("Start send document clean notify mail", fg="green")
-    )
+    logging.info(click.style("Start send document clean notify mail", fg="green"))
     start_at = time.perf_counter()
 
     # send document clean notify mail
     try:
-        dataset_auto_disable_logs = DatasetAutoDisableLog.query.filter(
-            DatasetAutoDisableLog.notified == False
-        ).all()
+        dataset_auto_disable_logs = DatasetAutoDisableLog.query.filter(DatasetAutoDisableLog.notified == False).all()
         # group by tenant_id
         dataset_auto_disable_logs_map = {}
         for dataset_auto_disable_log in dataset_auto_disable_logs:
@@ -47,25 +43,24 @@ def send_document_clean_notify_task():
 
             dataset_auto_dataset_map = {}
             for dataset_auto_disable_log in tenant_dataset_auto_disable_logs:
-                dataset_auto_dataset_map[dataset_auto_disable_log.dataset_id].append(dataset_auto_disable_log.document_id)
-
+                dataset_auto_dataset_map[dataset_auto_disable_log.dataset_id].append(
+                    dataset_auto_disable_log.document_id
+                )
 
             for dataset_id, document_ids in dataset_auto_dataset_map.items():
                 dataset = Dataset.query.filter(Dataset.id == dataset_id).first()
                 if dataset:
                     document_count = len(document_ids)
                     knowledge_details.append(f"<li>Knowledge base {dataset.name}: {document_count} documents</li>")
 
-                html_content = render_template(
+            html_content = render_template(
                 "clean_document_job_mail_template-US.html",
             )
             mail.send(to=to, subject="立即加入 Dify 工作空间", html=html_content)
 
         end_at = time.perf_counter()
         logging.info(
-            click.style(
-                "Send document clean notify mail succeeded: latency: {}".format(end_at - start_at), fg="green"
-            )
+            click.style("Send document clean notify mail succeeded: latency: {}".format(end_at - start_at), fg="green")
         )
     except Exception:
         logging.exception("Send invite member mail to {} failed".format(to))
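
One detail worth flagging in the hunk above: dataset_auto_dataset_map is a plain dict, yet the loop appends to dataset_auto_dataset_map[dataset_id] directly, which raises KeyError on the first unseen key unless keys are seeded elsewhere. The usual idiom for this group-by step is collections.defaultdict; a self-contained sketch, with tuples standing in for DatasetAutoDisableLog rows:

    from collections import defaultdict

    # (dataset_id, document_id) pairs standing in for the ORM rows.
    logs = [("ds-1", "doc-a"), ("ds-1", "doc-b"), ("ds-2", "doc-c")]

    dataset_auto_dataset_map: dict[str, list[str]] = defaultdict(list)
    for dataset_id, document_id in logs:
        # defaultdict(list) creates the list on first access, so append is safe.
        dataset_auto_dataset_map[dataset_id].append(document_id)

    assert dataset_auto_dataset_map["ds-1"] == ["doc-a", "doc-b"]
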
4 changes: 2 additions & 2 deletions api/tasks/add_document_to_index_task.py
@@ -11,8 +11,8 @@
 from core.rag.models.document import ChildDocument, Document
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
-from models.dataset import DatasetAutoDisableLog, Document as DatasetDocument
-from models.dataset import DocumentSegment
+from models.dataset import DatasetAutoDisableLog, DocumentSegment
+from models.dataset import Document as DatasetDocument
 
 
 @shared_task(queue="dataset")
