Skip to content

Commit

Permalink
Merge pull request #31 from gijshendriksen/use-ciff-toolkit
Browse files: browse the repository at this point in the history
Use CIFF toolkit for reading and writing CIFF files
  • Loading branch information
arjenpdevries authored Jan 26, 2024
2 parents 7f23bf6 + b1c7957 commit 13dc331
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 533 deletions.
67 changes: 27 additions & 40 deletions geesedb/index/fulltext_from_ciff.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
import os
import duckdb
from typing import Any, List, Union, Tuple
from ciff_toolkit.read import CiffReader

from ..connection import get_connection
from ..utils import CommonIndexFileFormat_pb2 as Ciff


class FullTextFromCiff:
Expand Down Expand Up @@ -109,50 +109,37 @@ def fill_tables(self) -> None:
with open(self.arguments['protobuf_file'], 'rb') as f:
data = f.read()

# start with reading header info
next_pos, pos = 0, 0
header = Ciff.Header()
next_pos, pos = self.decode(data, pos)
header.ParseFromString(data[pos:pos + next_pos])
pos += next_pos
with CiffReader(self.arguments['protobuf_file']) as reader:
for term_id, postings_list in enumerate(reader.read_postings_lists()):
self.connection.begin()
q = f'INSERT INTO {self.arguments["table_names"][1]} ' \
f'({",".join(self.arguments["columns_names_term_dict"])}) ' \
f"VALUES ({term_id},{postings_list.df},'{postings_list.term}')"
try:
self.cursor.execute(q)
except RuntimeError:
print(q)

docid = 0
for posting in postings_list.postings:
docid += posting.docid
q = f'INSERT INTO {self.arguments["table_names"][2]} ' \
f'({",".join(self.arguments["columns_names_term_doc"])}) ' \
f'VALUES ({term_id},{docid},{posting.tf})'
self.cursor.execute(q)
self.connection.commit()

# read posting lists
postings_list = Ciff.PostingsList()
for term_id in range(header.num_postings_lists):
self.connection.begin()
next_pos, pos = self.decode(data, pos)
postings_list.ParseFromString(data[pos:pos + next_pos])
pos += next_pos
q = f'INSERT INTO {self.arguments["table_names"][1]} ' \
f'({",".join(self.arguments["columns_names_term_dict"])}) ' \
f"VALUES ({term_id},{postings_list.df},'{postings_list.term}')"
try:
self.cursor.execute(q)
except RuntimeError:
print(q)
for posting in postings_list.postings:
q = f'INSERT INTO {self.arguments["table_names"][2]} ' \
f'({",".join(self.arguments["columns_names_term_doc"])}) ' \
f'VALUES ({term_id},{posting.docid},{posting.tf})'
for n, doc_record in enumerate(reader.read_documents()):
if n % 1000 == 0:
self.connection.commit()
self.connection.begin()
q = f'INSERT INTO {self.arguments["table_names"][0]} ' \
f'({",".join(self.arguments["columns_names_docs"])}) ' \
f"VALUES ('{doc_record.collection_docid}',{doc_record.docid},{doc_record.doclength})"
self.cursor.execute(q)
self.connection.commit()

# read doc information
doc_record = Ciff.DocRecord()
self.connection.begin()
for n in range(header.num_docs):
if n % 1000 == 0:
self.connection.commit()
self.connection.begin()
next_pos, pos = self.decode(data, pos)
doc_record.ParseFromString(data[pos:pos + next_pos])
pos += next_pos
q = f'INSERT INTO {self.arguments["table_names"][0]} ' \
f'({",".join(self.arguments["columns_names_docs"])}) ' \
f"VALUES ('{doc_record.collection_docid}',{doc_record.docid},{doc_record.doclength})"
self.cursor.execute(q)
self.connection.commit()


if __name__ == '__main__':
parser = argparse.ArgumentParser()
Expand Down
3 changes: 1 addition & 2 deletions geesedb/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from .ciff import CommonIndexFileFormat_pb2
from .ciff.to_csv import ToCSV
from .ciff.to_ciff import ToCiff

__all__ = ['CommonIndexFileFormat_pb2', 'ToCSV', 'ToCiff']
__all__ = ['ToCSV', 'ToCiff']
57 changes: 0 additions & 57 deletions geesedb/utils/ciff/CommonIndexFileFormat.proto

This file was deleted.

Loading

0 comments on commit 13dc331

Please sign in to comment.