Skip to content

Commit

Permalink
add doc/doc_extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
yangyaofei committed Dec 16, 2020
1 parent 0a649d7 commit f2b6b92
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 18 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ easier to use.
| DeepClassify |||| | | | |
| Cluster ||| ||| | |
| DocCompare | | | | | | | |
| DocExtractor | | | || | | |
| DocExtractor | | | || | | |
| DocParser | | | | | | | |
| iEncoder | | | | | | | |
| HTMLPaser | | | | | | | |
Expand Down
8 changes: 8 additions & 0 deletions docs/nlpir.native.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ nlpir.native.cluster module
:undoc-members:
:show-inheritance:

nlpir.native.doc\_extractor module
-------------------------------------

.. automodule:: nlpir.native.doc_extractor
:members:
:undoc-members:
:show-inheritance:

nlpir.native.nlpir\_base module
---------------------------------------

Expand Down
8 changes: 8 additions & 0 deletions docs/nlpir.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ nlpir.summary module
:undoc-members:
:show-inheritance:

nlpir.doc\_extractor module
------------------------------

.. automodule:: nlpir.doc_extractor
:members:
:undoc-members:
:show-inheritance:

nlpir.cluster module
-----------------------

Expand Down
3 changes: 2 additions & 1 deletion nlpir/doc_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class ExtractResult:
"""
A class for retrieving results from a Document Extractor handle
"""
#: Map of type names to the type codes that can be retrieved from DocExtractor
retrieve_type_map: typing.Dict[str, int] = {
"person": native.doc_extractor.DOC_EXTRACT_TYPE_PERSON,
"location": native.doc_extractor.DOC_EXTRACT_TYPE_LOCATION,
Expand Down Expand Up @@ -100,7 +101,7 @@ def get_result(
:param retrieve_types: optional, a list of the retrieve types to get;
defaults to all retrievable types, or to the types set by :func:`set_retrieve_types`
:return: a dict of results : ``{type_name: [result]}`` : example
:return: a dict of results : ``{type_name: [result]}`` , example
::
Expand Down
34 changes: 18 additions & 16 deletions nlpir/native/doc_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,22 @@
from ctypes import c_bool, c_char, c_char_p, c_double, c_int, c_uint, POINTER, Structure, byref, c_size_t
import typing

DOC_EXTRACT_TYPE_PERSON = 0 #: 输出的人名
DOC_EXTRACT_TYPE_LOCATION = 1 #: 输出的地名
DOC_EXTRACT_TYPE_ORGANIZATION = 2 #: 输出的机构名
DOC_EXTRACT_TYPE_KEYWORD = 3 #: 输出的关键词
DOC_EXTRACT_TYPE_AUTHOR = 4 #: 输出的文章作者
DOC_EXTRACT_TYPE_MEDIA = 5 #: 输出的媒体
DOC_EXTRACT_TYPE_COUNTRY = 6 #: 输出的文章对应的所在国别
DOC_EXTRACT_TYPE_PROVINCE = 7 #: 输出的文章对应的所在省份
DOC_EXTRACT_TYPE_ABSTRACT = 8 #: 输出文章的摘要
DOC_EXTRACT_TYPE_POSITIVE = 9 #: 输出文章的正面情感词
DOC_EXTRACT_TYPE_NEGATIVE = 10 #: 输出文章的负面情感词
DOC_EXTRACT_TYPE_TEXT = 11 #: 输出文章去除网页等标签后的正文
DOC_EXTRACT_TYPE_TIME = 12 #: 输出时间词
DOC_EXTRACT_TYPE_USER = 13 #: 用户自定义的词类,第一个自定义词
# 后续的自定义词,依次序号为:DOC_EXTRACT_TYPE_USER + 1;DOC_EXTRACT_TYPE_USER + 2;...
DOC_EXTRACT_TYPE_PERSON = 0 #: 人名
DOC_EXTRACT_TYPE_LOCATION = 1 #: 地名
DOC_EXTRACT_TYPE_ORGANIZATION = 2 #: 机构名
DOC_EXTRACT_TYPE_KEYWORD = 3 #: 关键词
DOC_EXTRACT_TYPE_AUTHOR = 4 #: 文章作者
DOC_EXTRACT_TYPE_MEDIA = 5 #: 媒体
DOC_EXTRACT_TYPE_COUNTRY = 6 #: 文章对应的所在国别
DOC_EXTRACT_TYPE_PROVINCE = 7 #: 文章对应的所在省份
DOC_EXTRACT_TYPE_ABSTRACT = 8 #: 文章的摘要
DOC_EXTRACT_TYPE_POSITIVE = 9 #: 文章的正面情感词
DOC_EXTRACT_TYPE_NEGATIVE = 10 #: 文章的负面情感词
DOC_EXTRACT_TYPE_TEXT = 11 #: 文章去除网页等标签后的正文
DOC_EXTRACT_TYPE_TIME = 12 #: 时间词
#: 用户自定义的词类,第一个自定义词
#: 后续的自定义词,依次序号为: :data:`DOC_EXTRACT_TYPE_USER` + 1 , :data:`DOC_EXTRACT_TYPE_USER` + 2 , ...
DOC_EXTRACT_TYPE_USER = 13

PERSON_REQUIRED = 0x0001
LOCATION_REQUIRED = 0x0002
Expand Down Expand Up @@ -89,7 +90,8 @@ def pares_doc_e(
生成单文档摘要
:param text: 文档内容
:param user_def_pos: 用户自定义的词性标记,最多三种(人名、地名、机构名、媒体等内置,无需设置),不同词类之间采用#分割,如"gms#gjtgj#g"
:param user_def_pos: 用户自定义的词性标记, 最多三种(人名、地名、机构名、媒体等内置,无需设置), 不同词类之间采用#分割,
如 ``gms#gjtgj#g``
:param summary_needed: 是否需要计算摘要
:param func_required:
:return: 用于获取内容的handle, 获取内容完毕后应使用 :func:`release_handle` 释放对应资源
Expand Down

0 comments on commit f2b6b92

Please sign in to comment.