Feat/support parent child chunk #12092

Merged (60 commits, Dec 25, 2024)
Commits (60)
41abf1b  sync parent child chunk (JohnJyong, Oct 9, 2024)
3bcaa0e  sync parent child chunk (JohnJyong, Oct 10, 2024)
9d48fe4  add child chunk binding (JohnJyong, Oct 10, 2024)
397a2fd  parent-child retrieval (JohnJyong, Oct 12, 2024)
af28b65  parent-child retrieval (JohnJyong, Oct 12, 2024)
935fe5e  parent-child retrieval (JohnJyong, Oct 14, 2024)
62473c8  parent-child retrieval (JohnJyong, Oct 15, 2024)
7e2884e  child chunk curd (JohnJyong, Oct 16, 2024)
b95728a  parent-child support for all event (JohnJyong, Oct 17, 2024)
23a1188  Merge branch 'main' into feat/support-parent-child-chunk (JohnJyong, Oct 17, 2024)
441f357  parent-child support for all event (JohnJyong, Oct 17, 2024)
5d5d984  parent-child support for all event (JohnJyong, Oct 18, 2024)
a170023  parent-child support for all event (JohnJyong, Oct 18, 2024)
25b550c  parent-child support for all event (JohnJyong, Oct 18, 2024)
3c11cc0  add single child-chunk update (JohnJyong, Nov 18, 2024)
b8237ad  Merge branch 'main' into feat/support-parent-child-chunk (JohnJyong, Nov 21, 2024)
2165795  merge main (JohnJyong, Nov 22, 2024)
bc1f8bd  parent-child fix (JohnJyong, Nov 25, 2024)
e387348  update text spliter (JohnJyong, Nov 27, 2024)
42bad3e  Merge branch 'main' into feat/support-parent-child-chunk (JohnJyong, Nov 27, 2024)
cd58f47  fix issues (JohnJyong, Nov 28, 2024)
b807d65  fix Parent-Child issues (JohnJyong, Dec 4, 2024)
d4384e6  fix bugs (JohnJyong, Dec 5, 2024)
f33a0b2  Merge branch 'main' into feat/support-parent-child-chunk (JohnJyong, Dec 5, 2024)
206684d  add auto disable dataset logs (JohnJyong, Dec 5, 2024)
e205e8f  add auto disable dataset logs (JohnJyong, Dec 6, 2024)
4f3a976  add auto disable dataset logs (JohnJyong, Dec 6, 2024)
ce6637b  add batch_clean_document_task (JohnJyong, Dec 6, 2024)
21ee066  update tidb batch get endpoint to basic mode (JohnJyong, Dec 6, 2024)
9ea375f  update tidb batch get endpoint to basic mode (JohnJyong, Dec 6, 2024)
5d92205  fix segment position issue (JohnJyong, Dec 9, 2024)
6a2125d  delete document (JohnJyong, Dec 9, 2024)
98c8bfd  fix document Retrieval model (JohnJyong, Dec 10, 2024)
90e3d73  add doc_form for dataset detail (JohnJyong, Dec 10, 2024)
ee4778b  fix regenerate child chunks (JohnJyong, Dec 10, 2024)
b2a1ae4  fix regenerate child chunks (JohnJyong, Dec 11, 2024)
fde369a  fix regenerate child chunks (JohnJyong, Dec 11, 2024)
b223b01  add migration (JohnJyong, Dec 12, 2024)
8f064f3  fix child chunk position (JohnJyong, Dec 12, 2024)
964dc67  update child chunk max position (JohnJyong, Dec 13, 2024)
679e90e  show child chunk position (JohnJyong, Dec 18, 2024)
bc7d1d8  fix document updated (JohnJyong, Dec 19, 2024)
827f188  fix document updated (JohnJyong, Dec 20, 2024)
c8b6f6f  fix document updated (JohnJyong, Dec 20, 2024)
f3095c7  word extractor (JohnJyong, Dec 23, 2024)
f63b85d  qa extractor (JohnJyong, Dec 23, 2024)
515e582  qa extractor (JohnJyong, Dec 23, 2024)
71950ee  notion and website import fix (JohnJyong, Dec 24, 2024)
a250335  notion and website import fix (JohnJyong, Dec 24, 2024)
98cd57f  notion and website import fix (JohnJyong, Dec 24, 2024)
50fb8dc  notion and website import fix (JohnJyong, Dec 24, 2024)
d4ec586  notion and website import fix (JohnJyong, Dec 25, 2024)
03c45f4  notion and website import fix (JohnJyong, Dec 25, 2024)
409b684  notion and website import fix (JohnJyong, Dec 25, 2024)
5910137  notion and website import fix (JohnJyong, Dec 25, 2024)
d1a76ae  Merge branch 'main' into feat/support-parent-child-chunk (JohnJyong, Dec 25, 2024)
5604453  fix merge (JohnJyong, Dec 25, 2024)
588fe21  fix merge (JohnJyong, Dec 25, 2024)
148b0b4  fix merge (JohnJyong, Dec 25, 2024)
10f5c8d  fix merge (JohnJyong, Dec 25, 2024)
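
Every commit above belongs to one feature: parent-child chunking, where a document is split into large parent chunks, each parent is re-split into smaller child chunks, retrieval matches against the children, and the enclosing parent is returned as context. A conceptual sketch of that flow, using illustrative class names and splitting rules that are not the PR's actual implementation:

# Conceptual sketch of parent-child chunking; the splitter settings and names below
# are illustrative assumptions, not the code added by this PR.
from dataclasses import dataclass, field


@dataclass
class ParentChunk:
    content: str
    children: list[str] = field(default_factory=list)


def build_parent_child_chunks(text: str, parent_size: int = 1024, child_size: int = 256) -> list[ParentChunk]:
    parents: list[ParentChunk] = []
    for i in range(0, len(text), parent_size):
        parent_text = text[i : i + parent_size]
        children = [parent_text[j : j + child_size] for j in range(0, len(parent_text), child_size)]
        parents.append(ParentChunk(content=parent_text, children=children))
    return parents


def retrieve(query_terms: set[str], parents: list[ParentChunk]) -> list[str]:
    # Match the query against the small child chunks, but return the full parent chunk as context.
    return [
        parent.content
        for parent in parents
        if any(query_terms & set(child.lower().split()) for child in parent.children)
    ]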
2 changes: 1 addition & 1 deletion api/controllers/console/datasets/data_source.py
@@ -218,7 +218,7 @@ def post(self):
args["doc_form"],
args["doc_language"],
)
return response, 200
return response.model_dump(), 200


class DataSourceNotionDatasetSyncApi(Resource):
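Note on the change above: the indexing-estimate response is now a Pydantic model rather than a plain dict, so the controller serializes it with model_dump() before returning. A minimal sketch of the pattern, assuming a simplified response schema (the real fields live in the Dify service layer):

# Minimal sketch of the model_dump() pattern; the IndexingEstimate fields here are
# assumed for illustration and do not mirror Dify's actual schema.
from pydantic import BaseModel


class IndexingEstimate(BaseModel):
    total_segments: int
    tokens: int = 0
    preview: list[str] = []


def notion_estimate_handler() -> tuple[dict, int]:
    response = IndexingEstimate(total_segments=3, tokens=1200, preview=["parent chunk A"])
    # Flask-RESTful serializes plain dicts, so dump the model before returning it.
    return response.model_dump(), 200
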
15 changes: 14 additions & 1 deletion api/controllers/console/datasets/datasets.py
@@ -464,7 +464,7 @@ def post(self):
except Exception as e:
raise IndexingEstimateError(str(e))

return response, 200
return response.model_dump(), 200


class DatasetRelatedAppListApi(Resource):
@@ -733,6 +733,18 @@ def get(self, dataset_id):
}, 200


class DatasetAutoDisableLogApi(Resource):
@setup_required
@login_required
@account_initialization_required
def get(self, dataset_id):
dataset_id_str = str(dataset_id)
dataset = DatasetService.get_dataset(dataset_id_str)
if dataset is None:
raise NotFound("Dataset not found.")
return DatasetService.get_dataset_auto_disable_logs(dataset_id_str), 200


api.add_resource(DatasetListApi, "/datasets")
api.add_resource(DatasetApi, "/datasets/<uuid:dataset_id>")
api.add_resource(DatasetUseCheckApi, "/datasets/<uuid:dataset_id>/use-check")
@@ -747,3 +759,4 @@ def get(self, dataset_id):
api.add_resource(DatasetRetrievalSettingApi, "/datasets/retrieval-setting")
api.add_resource(DatasetRetrievalSettingMockApi, "/datasets/retrieval-setting/<string:vector_type>")
api.add_resource(DatasetPermissionUserListApi, "/datasets/<uuid:dataset_id>/permission-part-users")
api.add_resource(DatasetAutoDisableLogApi, "/datasets/<uuid:dataset_id>/auto-disable-logs")
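
The new DatasetAutoDisableLogApi resource registered above exposes the auto-disable logs at /datasets/<dataset_id>/auto-disable-logs. A hedged client-side sketch of the call; the console URL prefix, auth header, and IDs are placeholders, not part of this diff:

# Hypothetical request against the new route; base URL, auth scheme, and the dataset
# ID are assumptions for illustration only.
import requests

CONSOLE_API = "http://localhost:5001/console/api"  # assumed console API prefix
dataset_id = "<dataset-uuid>"

resp = requests.get(
    f"{CONSOLE_API}/datasets/{dataset_id}/auto-disable-logs",
    headers={"Authorization": "Bearer <console-session-token>"},  # assumed auth header
    timeout=10,
)
resp.raise_for_status()
print(resp.json())  # whatever DatasetService.get_dataset_auto_disable_logs returns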
178 changes: 99 additions & 79 deletions api/controllers/console/datasets/datasets_document.py
@@ -52,6 +52,7 @@
from libs.login import login_required
from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
from services.dataset_service import DatasetService, DocumentService
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task

@@ -255,20 +256,22 @@ def post(self, dataset_id):
parser.add_argument("duplicate", type=bool, default=True, nullable=False, location="json")
parser.add_argument("original_document_id", type=str, required=False, location="json")
parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")

parser.add_argument(
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
args = parser.parse_args()
knowledge_config = KnowledgeConfig(**args)

if not dataset.indexing_technique and not args["indexing_technique"]:
if not dataset.indexing_technique and not knowledge_config.indexing_technique:
raise ValueError("indexing_technique is required.")

# validate args
DocumentService.document_create_args_validate(args)
DocumentService.document_create_args_validate(knowledge_config)

try:
documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
documents, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, current_user)
except ProviderTokenNotInitError as ex:
raise ProviderNotInitializeError(ex.description)
except QuotaExceededError:
@@ -278,6 +281,25 @@

return {"documents": documents, "batch": batch}

@setup_required
@login_required
@account_initialization_required
def delete(self, dataset_id):
dataset_id = str(dataset_id)
dataset = DatasetService.get_dataset(dataset_id)
if dataset is None:
raise NotFound("Dataset not found.")
# check user's model setting
DatasetService.check_dataset_model_setting(dataset)

try:
document_ids = request.args.getlist("document_id")
DocumentService.delete_documents(dataset, document_ids)
except services.errors.document.DocumentIndexingError:
raise DocumentIndexingError("Cannot delete document during indexing.")

return {"result": "success"}, 204


class DatasetInitApi(Resource):
@setup_required
@@ -313,9 +335,9 @@ def post(self):
# The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
if not current_user.is_dataset_editor:
raise Forbidden()

if args["indexing_technique"] == "high_quality":
if args["embedding_model"] is None or args["embedding_model_provider"] is None:
knowledge_config = KnowledgeConfig(**args)
if knowledge_config.indexing_technique == "high_quality":
if knowledge_config.embedding_model is None or knowledge_config.embedding_model_provider is None:
raise ValueError("embedding model and embedding model provider are required for high quality indexing.")
try:
model_manager = ModelManager()
@@ -334,11 +356,11 @@ def post(self):
raise ProviderNotInitializeError(ex.description)

# validate args
DocumentService.document_create_args_validate(args)
DocumentService.document_create_args_validate(knowledge_config)

try:
dataset, documents, batch = DocumentService.save_document_without_dataset_id(
tenant_id=current_user.current_tenant_id, document_data=args, account=current_user
tenant_id=current_user.current_tenant_id, knowledge_config=knowledge_config, account=current_user
)
except ProviderTokenNotInitError as ex:
raise ProviderNotInitializeError(ex.description)
@@ -409,7 +431,7 @@ def get(self, dataset_id, document_id):
except Exception as e:
raise IndexingEstimateError(str(e))

return response
return response.model_dump(), 200


class DocumentBatchIndexingEstimateApi(DocumentResource):
@@ -422,7 +444,7 @@ def get(self, dataset_id, batch):
documents = self.get_batch_documents(dataset_id, batch)
response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}
if not documents:
return response
return response, 200
data_process_rule = documents[0].dataset_process_rule
data_process_rule_dict = data_process_rule.to_dict()
info_list = []
@@ -509,7 +531,7 @@ def get(self, dataset_id, batch):
raise ProviderNotInitializeError(ex.description)
except Exception as e:
raise IndexingEstimateError(str(e))
return response
return response.model_dump(), 200


class DocumentBatchIndexingStatusApi(DocumentResource):
@@ -582,15 +604,17 @@ def get(self, dataset_id, document_id):
if metadata == "only":
response = {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata}
elif metadata == "without":
process_rules = DatasetService.get_process_rules(dataset_id)
dataset_process_rules = DatasetService.get_process_rules(dataset_id)
document_process_rules = document.dataset_process_rule.to_dict()
data_source_info = document.data_source_detail_dict
response = {
"id": document.id,
"position": document.position,
"data_source_type": document.data_source_type,
"data_source_info": data_source_info,
"dataset_process_rule_id": document.dataset_process_rule_id,
"dataset_process_rule": process_rules,
"dataset_process_rule": dataset_process_rules,
"document_process_rule": document_process_rules,
"name": document.name,
"created_from": document.created_from,
"created_by": document.created_by,
@@ -613,15 +637,17 @@ def get(self, dataset_id, document_id):
"doc_language": document.doc_language,
}
else:
process_rules = DatasetService.get_process_rules(dataset_id)
dataset_process_rules = DatasetService.get_process_rules(dataset_id)
document_process_rules = document.dataset_process_rule.to_dict()
data_source_info = document.data_source_detail_dict
response = {
"id": document.id,
"position": document.position,
"data_source_type": document.data_source_type,
"data_source_info": data_source_info,
"dataset_process_rule_id": document.dataset_process_rule_id,
"dataset_process_rule": process_rules,
"dataset_process_rule": dataset_process_rules,
"document_process_rule": document_process_rules,
"name": document.name,
"created_from": document.created_from,
"created_by": document.created_by,
@@ -757,9 +783,8 @@ class DocumentStatusApi(DocumentResource):
@login_required
@account_initialization_required
@cloud_edition_billing_resource_check("vector_space")
def patch(self, dataset_id, document_id, action):
def patch(self, dataset_id, action):
dataset_id = str(dataset_id)
document_id = str(document_id)
dataset = DatasetService.get_dataset(dataset_id)
if dataset is None:
raise NotFound("Dataset not found.")
@@ -774,84 +799,79 @@ def patch(self, dataset_id, document_id, action):
# check user's permission
DatasetService.check_dataset_permission(dataset, current_user)

document = self.get_document(dataset_id, document_id)
document_ids = request.args.getlist("document_id")
for document_id in document_ids:
document = self.get_document(dataset_id, document_id)

indexing_cache_key = "document_{}_indexing".format(document.id)
cache_result = redis_client.get(indexing_cache_key)
if cache_result is not None:
raise InvalidActionError("Document is being indexed, please try again later")
indexing_cache_key = "document_{}_indexing".format(document.id)
cache_result = redis_client.get(indexing_cache_key)
if cache_result is not None:
raise InvalidActionError(f"Document:{document.name} is being indexed, please try again later")

if action == "enable":
if document.enabled:
raise InvalidActionError("Document already enabled.")
if action == "enable":
if document.enabled:
continue
document.enabled = True
document.disabled_at = None
document.disabled_by = None
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()

document.enabled = True
document.disabled_at = None
document.disabled_by = None
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()
# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)

# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)
add_document_to_index_task.delay(document_id)

add_document_to_index_task.delay(document_id)
elif action == "disable":
if not document.completed_at or document.indexing_status != "completed":
raise InvalidActionError(f"Document: {document.name} is not completed.")
if not document.enabled:
continue

return {"result": "success"}, 200
document.enabled = False
document.disabled_at = datetime.now(UTC).replace(tzinfo=None)
document.disabled_by = current_user.id
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()

elif action == "disable":
if not document.completed_at or document.indexing_status != "completed":
raise InvalidActionError("Document is not completed.")
if not document.enabled:
raise InvalidActionError("Document already disabled.")
# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)

document.enabled = False
document.disabled_at = datetime.now(UTC).replace(tzinfo=None)
document.disabled_by = current_user.id
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()
remove_document_from_index_task.delay(document_id)

# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)
elif action == "archive":
if document.archived:
continue

remove_document_from_index_task.delay(document_id)
document.archived = True
document.archived_at = datetime.now(UTC).replace(tzinfo=None)
document.archived_by = current_user.id
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()

return {"result": "success"}, 200
if document.enabled:
# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)

elif action == "archive":
if document.archived:
raise InvalidActionError("Document already archived.")
remove_document_from_index_task.delay(document_id)

document.archived = True
document.archived_at = datetime.now(UTC).replace(tzinfo=None)
document.archived_by = current_user.id
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()
elif action == "un_archive":
if not document.archived:
continue
document.archived = False
document.archived_at = None
document.archived_by = None
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()

if document.enabled:
# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)

remove_document_from_index_task.delay(document_id)

return {"result": "success"}, 200
elif action == "un_archive":
if not document.archived:
raise InvalidActionError("Document is not archived.")

document.archived = False
document.archived_at = None
document.archived_by = None
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()

# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)
add_document_to_index_task.delay(document_id)

add_document_to_index_task.delay(document_id)

return {"result": "success"}, 200
else:
raise InvalidActionError()
else:
raise InvalidActionError()
return {"result": "success"}, 200


class DocumentPauseApi(DocumentResource):
Expand Down Expand Up @@ -1022,7 +1042,7 @@ def get(self, dataset_id, document_id):
)
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(DocumentMetadataApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata")
api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/status/<string:action>")
api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/status/<string:action>/batch")
api.add_resource(DocumentPauseApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause")
api.add_resource(DocumentRecoverApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume")
api.add_resource(DocumentRetryApi, "/datasets/<uuid:dataset_id>/retry")
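
Two routing changes in this file are easy to miss: document status changes now go through a batch endpoint at /datasets/<dataset_id>/documents/status/<action>/batch that reads repeated document_id query parameters, and the document list resource gains a DELETE handler that reads the same parameter for batch deletion. A hedged client-side sketch of both calls, assuming the document list resource is mounted at /datasets/<dataset_id>/documents and using placeholder URLs, auth, and IDs:

# Hypothetical calls against the reworked routes; base URL, auth header, mount point
# of the document list resource, and all IDs are assumptions for illustration.
import requests

CONSOLE_API = "http://localhost:5001/console/api"              # assumed console API prefix
HEADERS = {"Authorization": "Bearer <console-session-token>"}  # assumed auth header
dataset_id = "<dataset-uuid>"
document_ids = ["<doc-uuid-1>", "<doc-uuid-2>"]
params = [("document_id", doc_id) for doc_id in document_ids]  # repeated query parameter

# Disable several documents in one call; the handler loops over the document_id values
# and skips documents that are already disabled.
requests.patch(
    f"{CONSOLE_API}/datasets/{dataset_id}/documents/status/disable/batch",
    params=params,
    headers=HEADERS,
    timeout=10,
)

# Batch-delete documents; DocumentService.delete_documents receives the same ID list.
requests.delete(
    f"{CONSOLE_API}/datasets/{dataset_id}/documents",
    params=params,
    headers=HEADERS,
    timeout=10,
)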