forked from yanqiangmiffy/Chinese-LangChain
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_knowledge.py
82 lines (69 loc) · 2.7 KB
/
create_knowledge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:quincy qiang
@license: Apache Licence
@file: create_knowledge.py
@time: 2023/04/18
@contact: [email protected]
@software: PyCharm
@description: - emoji:https://emojixd.com/pocket/science
"""
import os
import pandas as pd
from langchain.schema import Document
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from tqdm import tqdm
# Example: importing Chinese Wikipedia data (kept commented out further down);
# the active settings here point at the financial research reports.
# Filesystem path of the pretrained embedding model (text2vec-large-chinese).
embedding_model_name = '/root/pretrained_models/text2vec-large-chinese'
# Directory that is scanned for the financial research report .txt files.
docs_path = '/root/GoMall/Knowledge-ChatGLM/cache/financial_research_reports'
# Embedding model instance that is passed to FAISS.from_documents below.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
# Wikipedia data processing
# docs = []
# with open('docs/zh_wikipedia/zhwiki.sim.utf8', 'r', encoding='utf-8') as f:
# for idx, line in tqdm(enumerate(f.readlines())):
# metadata = {"source": f'doc_id_{idx}'}
# docs.append(Document(page_content=line.strip(), metadata=metadata))
#
# vector_store = FAISS.from_documents(docs, embeddings)
# vector_store.save_local('cache/zh_wikipedia/')
# docs = []
#
# with open('cache/zh_wikipedia/wiki.zh-sim-cleaned.txt', 'r', encoding='utf-8') as f:
# for idx, line in tqdm(enumerate(f.readlines())):
# metadata = {"source": f'doc_id_{idx}'}
# docs.append(Document(page_content=line.strip(), metadata=metadata))
#
# vector_store = FAISS.from_documents(docs, embeddings)
# vector_store.save_local('cache/zh_wikipedia/')
# Financial research report processing: build one Document per .txt file
# found directly under docs_path, embed them, and save the FAISS index.
docs = []
for filename in tqdm(os.listdir(docs_path)):
    # Only plain-text reports are ingested; every other entry is skipped.
    if not filename.endswith('.txt'):
        continue
    # `with` closes each file handle as soon as it has been read instead of
    # leaving one open handle per report for the rest of the run.
    with open(os.path.join(docs_path, filename), 'r', encoding='utf-8') as f:
        # split() + join removes ALL whitespace (spaces, tabs, newlines), so
        # each report becomes a single unbroken string.
        # NOTE(review): presumably intended for Chinese text, where spaces
        # are not word delimiters -- confirm before feeding other languages.
        text = ''.join(f.read().split())
    docs.append(Document(page_content=text, metadata={"source": f'doc_id_{filename}'}))

# Fail early with a clear message: with no documents there is nothing to
# embed or index.
if not docs:
    raise ValueError(f'no .txt files found in {docs_path}')

vector_store = FAISS.from_documents(docs, embeddings)
# The index is written relative to the current working directory.
vector_store.save_local('cache/financial_research_reports')
# # League of Legends
#
# docs = []
#
# lol_df = pd.read_csv('cache/lol/champions.csv')
# # lol_df.columns = ['id', '英雄简称', '英雄全称', '出生地', '人物属性', '英雄类别', '英雄故事']
# print(lol_df)
#
# for idx, row in lol_df.iterrows():
# metadata = {"source": f'doc_id_{idx}'}
# text = ' '.join(row.values)
# # for col in ['英雄简称', '英雄全称', '出生地', '人物属性', '英雄类别', '英雄故事']:
# # text += row[col]
# docs.append(Document(page_content=text, metadata=metadata))
#
# vector_store = FAISS.from_documents(docs, embeddings)
# vector_store.save_local('cache/lol/')