Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiment: much more powerful MDS type system ("DBS"). #384

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 179 additions & 0 deletions scripts/dbs/compare_ints.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
# Copyright 2023 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Chart int serialization efficiency (byte widths when serialized).

Results are compared:
- Across many orders of magnitude (rows).
- Across many DBS types (columns).
"""

import json
import pickle
from argparse import ArgumentParser, Namespace
from typing import Iterator, Optional

import numpy as np

# Unsigned numpy integer types charted as columns, narrowest first.
uint_dtypes = (np.uint8, np.uint16, np.uint32, np.uint64)
# Signed numpy integer types charted as columns, narrowest first.
int_dtypes = (np.int8, np.int16, np.int32, np.int64)


def parse_args() -> Namespace:
    """Parse command-line arguments.

    Two integer options are accepted: ``--powers_of_ten`` (default 24) and ``--col_width``
    (default 3).

    Returns:
        Namespace: Command-line arguments.
    """
    parser = ArgumentParser()
    parser.add_argument(
        '--powers_of_ten',
        type=int,
        default=24,
        help=('Size of the range of ints to chart, both in the positive and '
              'negative directions'),
    )
    parser.add_argument(
        '--col_width',
        type=int,
        default=3,
        help='Widths of the columns describing int serialized sizes',
    )
    return parser.parse_args()


def norm_len(text: str, size: int) -> str:
    """Normalize text length to the given size by padding on the left. Must fit.

    Args:
        text (str): Original text.
        size (int): Desired text length.

    Returns:
        str: Text right-justified with spaces to exactly ``size`` characters.

    Raises:
        ValueError: If the text is longer than ``size``.
    """
    if size < len(text):
        # Report the actual text length next to the column size so the mismatch is clear.
        raise ValueError(
            f'Field is too big for its column: {text} ({len(text)} vs {size} chrs).')
    return text.rjust(size)


def dtype_to_col_name(dtype: type, col_width: int) -> str:
    """Build a fixed-width column name from a numpy data type.

    The type's ``__name__`` is abbreviated by replacing ``uint`` with ``u`` and then ``int``
    with ``i``, and the result is padded to the column width.

    Args:
        dtype (type): Numpy data type.
        col_width (int): Pad to desired column width (must fit).

    Returns:
        str: Fixed-length column name.
    """
    # Order matters: 'uint' must be shortened before the 'int' it contains.
    abbreviated = dtype.__name__.replace('uint', 'u').replace('int', 'i')
    return norm_len(abbreviated, col_width)


def each_col_name(powers_of_ten: int, col_width: int) -> Iterator[str]:
    """Generate the header text of every column, left to right.

    The first column is labelled ``int``; it is followed by one column per unsigned and signed
    numpy integer type, then ``jsn`` and ``pkl``.

    Args:
        powers_of_ten (int): Number of powers of ten to chart, which sets the first column's width.
        col_width (int): Pad the remaining columns to this width (must fit).

    Returns:
        Iterator[str]: Iterator over column names.
    """
    # Room for the digits, the thousands separators, and one extra character (presumably the sign).
    int_col_width = 1 + powers_of_ten + powers_of_ten // 3
    yield 'int'.rjust(int_col_width)
    for dtype in uint_dtypes + int_dtypes:
        yield dtype_to_col_name(dtype, col_width)
    for label in ('jsn', 'pkl'):
        yield norm_len(label, col_width)


def chr_to_div(ch: str) -> str:
    """Map one header character to the matching divider character.

    Spaces stay spaces, column separators (``|``) become ``+``, and everything else becomes ``-``.

    Args:
        ch (str): Character of header.

    Returns:
        str: Character of divider.
    """
    special = {' ': ' ', '|': '+'}
    return special.get(ch, '-')


def each_int(powers_of_ten: int) -> Iterator[int]:
    """Generate the ints to chart, in ascending order.

    Yields the negated powers of ten from ``-10**(powers_of_ten - 1)`` up to ``-1``, then the
    powers of ten from ``1`` up to ``10**(powers_of_ten - 1)``. Zero is never produced.

    Args:
        powers_of_ten (int): Number of powers of ten to chart, both positive and negative.

    Returns:
        Iterator[int]: Iterator over ints (rows).
    """
    exponents = range(powers_of_ten)
    # Negative half first, largest magnitude down to -1.
    for exp in reversed(exponents):
        yield -(10**exp)
    # Positive half, 1 up to the largest magnitude.
    for exp in exponents:
        yield 10**exp


def get_size_as(val: int, dtype: type) -> Optional[int]:
    """Report how many bytes the int occupies as the given dtype, if it is representable.

    Args:
        val (int): Int to convert.
        dtype (type): Numpy data type to convert to.

    Returns:
        Optional[int]: Data type size in bytes on success, or None on failure.
    """
    try:
        converted = dtype(val)
    except OverflowError:
        return None
    # A conversion that succeeded but did not round-trip to the same value counts as a failure.
    return None if val != converted else converted.nbytes


def each_field(val: int, powers_of_ten: int, col_width: int) -> Iterator[str]:
    """Get each field str to chart for one row.

    The first field is the int itself, formatted with thousands separators. It is followed by
    its size in bytes as each unsigned and signed numpy integer type (blank when the type cannot
    hold it), then the byte lengths of its UTF-8 encoded JSON form and of its pickle.

    Args:
        val (int): Int to convert to various types/dtypes.
        powers_of_ten (int): Number of powers of ten to chart, both positive and negative.
        col_width (int): Widths of the columns showing the various serialized sizes of the int.

    Returns:
        Iterator[str]: Each field of a row of the chart.

    Raises:
        ValueError: If a field does not fit in its column.
    """
    val_str = f'{val:,}'
    num_commas = powers_of_ten // 3
    # Same width formula as the header's first column, so rows line up with it.
    yield norm_len(val_str, 1 + powers_of_ten + num_commas)
    for dtype in uint_dtypes + int_dtypes:
        size = get_size_as(val, dtype)
        # None means "not representable"; test for it explicitly rather than by truthiness.
        size_str = '' if size is None else str(size)
        yield norm_len(size_str, col_width)
    jsn_data = json.dumps(val).encode('utf-8')
    yield norm_len(str(len(jsn_data)), col_width)
    pkl_data = pickle.dumps(val)
    yield norm_len(str(len(pkl_data)), col_width)


def main(args: Namespace) -> None:
    """Chart how efficiently int serializes across a range of values and types/dtypes.

    Prints a header row, a divider row derived from the header, and then one row per charted int.

    Args:
        args (Namespace): Command-line arguments; ``powers_of_ten`` and ``col_width`` are read.
    """
    separator = ' | '
    header = separator.join(each_col_name(args.powers_of_ten, args.col_width))
    print(header)
    # The divider mirrors the header character by character, so it lines up with the columns.
    divider = ''.join(chr_to_div(ch) for ch in header)
    print(divider)
    for val in each_int(args.powers_of_ten):
        row = separator.join(each_field(val, args.powers_of_ten, args.col_width))
        print(row)


if __name__ == '__main__':
    # Script entry point: parse the command line, then print the chart.
    cli_args = parse_args()
    main(cli_args)
108 changes: 108 additions & 0 deletions streaming/base/format/dbs/type/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
## Hierarchy of DBS types

```
DBSType (all types)
├── Leaf (terminal types)
│   ├── FixLeaf (of fixed serialized size)
│   │   ├── Null (None in python)
│   │   └── Number (single ints, floats, and bools)
│   │   ├── NumpyNumber (specific int and float types)
│   │   │   ├── NumpyFloat (specific float types)
│   │   │   │   ├── Float16
│   │   │   │   ├── Float32
│   │   │   │   └── Float64
│   │   │   └── NumpyInt (specific int types)
│   │   │   ├── Int16
│   │   │   ├── Int32
│   │   │   ├── Int64
│   │   │   ├── Int8
│   │   │   ├── UInt16
│   │   │   ├── UInt32
│   │   │   ├── UInt64
│   │   │   └── UInt8
│   │   └── PythonNumber (generic int, float, and bool types)
│   │   ├── Bool
│   │   ├── Float
│   │   └── Int
│   └── VarLeaf (of varying serialized size)
│   ├── ComplexVarLeaf (structures of objects)
│   │   ├── JSON
│   │   └── Pickle
│   └── SimpleVarLeaf (one object)
│   ├── Bytes
│   ├── Image
│   │   ├── FmtImage
│   │   │   ├── JPG
│   │   │   └── PNG
│   │   └── RawImage
│   ├── NDArray
│   └── Str
├── Tree (recursive types)
│   ├── Dict
│   └── Sequence (lists and tuples)
│   ├── List
│   └── Tuple
└── Union (polymorphic types)
├── Any
├── Just
└── Maybe
```

## Int serialization

Byte widths when serialized:
- Across many orders of magnitude (rows).
- Across many DBS types (columns).

```
int | u8 | u16 | u32 | u64 | i8 | i16 | i32 | i64 | jsn | pkl
--- + -- + --- + --- + --- + -- + --- + --- + --- + --- + ---
-100,000,000,000,000,000,000,000 | | | | | | | | | 25 | 24
-10,000,000,000,000,000,000,000 | | | | | | | | | 24 | 24
-1,000,000,000,000,000,000,000 | | | | | | | | | 23 | 23
-100,000,000,000,000,000,000 | | | | | | | | | 22 | 23
-10,000,000,000,000,000,000 | | | | | | | | | 21 | 23
-1,000,000,000,000,000,000 | | | | | | | | 8 | 20 | 22
-100,000,000,000,000,000 | | | | | | | | 8 | 19 | 22
-10,000,000,000,000,000 | | | | | | | | 8 | 18 | 21
-1,000,000,000,000,000 | | | | | | | | 8 | 17 | 21
-100,000,000,000,000 | | | | | | | | 8 | 16 | 20
-10,000,000,000,000 | | | | | | | | 8 | 15 | 20
-1,000,000,000,000 | | | | | | | | 8 | 14 | 20
-100,000,000,000 | | | | | | | | 8 | 13 | 19
-10,000,000,000 | | | | | | | | 8 | 12 | 19
-1,000,000,000 | | | | | | | 4 | 8 | 11 | 17
-100,000,000 | | | | | | | 4 | 8 | 10 | 17
-10,000,000 | | | | | | | 4 | 8 | 9 | 17
-1,000,000 | | | | | | | 4 | 8 | 8 | 17
-100,000 | | | | | | | 4 | 8 | 7 | 17
-10,000 | | | | | | 2 | 4 | 8 | 6 | 17
-1,000 | | | | | | 2 | 4 | 8 | 5 | 17
-100 | | | | | 1 | 2 | 4 | 8 | 4 | 17
-10 | | | | | 1 | 2 | 4 | 8 | 3 | 17
-1 | | | | | 1 | 2 | 4 | 8 | 2 | 17
1 | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 | 1 | 5
10 | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 | 2 | 5
100 | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 | 3 | 5
1,000 | | 2 | 4 | 8 | | 2 | 4 | 8 | 4 | 15
10,000 | | 2 | 4 | 8 | | 2 | 4 | 8 | 5 | 15
100,000 | | | 4 | 8 | | | 4 | 8 | 6 | 17
1,000,000 | | | 4 | 8 | | | 4 | 8 | 7 | 17
10,000,000 | | | 4 | 8 | | | 4 | 8 | 8 | 17
100,000,000 | | | 4 | 8 | | | 4 | 8 | 9 | 17
1,000,000,000 | | | 4 | 8 | | | 4 | 8 | 10 | 17
10,000,000,000 | | | | 8 | | | | 8 | 11 | 19
100,000,000,000 | | | | 8 | | | | 8 | 12 | 19
1,000,000,000,000 | | | | 8 | | | | 8 | 13 | 20
10,000,000,000,000 | | | | 8 | | | | 8 | 14 | 20
100,000,000,000,000 | | | | 8 | | | | 8 | 15 | 20
1,000,000,000,000,000 | | | | 8 | | | | 8 | 16 | 21
10,000,000,000,000,000 | | | | 8 | | | | 8 | 17 | 21
100,000,000,000,000,000 | | | | 8 | | | | 8 | 18 | 22
1,000,000,000,000,000,000 | | | | 8 | | | | 8 | 19 | 22
10,000,000,000,000,000,000 | | | | 8 | | | | | 20 | 23
100,000,000,000,000,000,000 | | | | | | | | | 21 | 23
1,000,000,000,000,000,000,000 | | | | | | | | | 22 | 23
10,000,000,000,000,000,000,000 | | | | | | | | | 23 | 24
100,000,000,000,000,000,000,000 | | | | | | | | | 24 | 24
```
Loading