-
Notifications
You must be signed in to change notification settings - Fork 0
/
UTF8decode.py
54 lines (40 loc) · 1.36 KB
/
UTF8decode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
''' UTF-8 Decoding
'''
# Input: a string of '0'/'1' characters holding the bits of one UTF-8 encoded
# character. Keep exactly one assignment to `s` uncommented.
#s = '00100110'
s = '1100110110101011'
# Case 1: a leading 0 bit marks a single-byte (ASCII) character.
if s[0] == '0':
# Parse the bit string as a base-2 integer to obtain the code point.
# NOTE(review): the whole of s is parsed here, not only its first 8 bits.
myint = int(s, 2)
# Map the code point to its character.
mychar = chr(myint)
print ('The ASCII character for %s is "%s"'%(s, mychar))
else:
# Case 2: multi-byte sequence; the payload bits have to be unpacked.
# Step 1: the number of leading 1 bits in the first byte gives the sequence
# length in bytes. That count equals the index of the first '0' in s.
# NOTE(review): find() returns -1 when s contains no '0' at all; that case
# is not handled here, and lengths above 4 are not rejected either.
char_length = s.find('0')
# A first byte of the form 10xxxxxx is a continuation byte, so it cannot
# start a character.
if char_length == 1:
print('Character %s is not a valid UTF-8 character.'%(s))
exit()
# Step 2: keep only the bits of this character, i.e. the first
# 8 * char_length bits of s (slicing silently truncates if s is shorter).
mychar = s[:8*char_length]
# Step 3: start with an empty payload (data) bit string.
data = ''
# Step 4: cut the character into 8-bit chunks, one list entry per byte.
mybytes = []
for i in range(0, len(mychar), 8):
mybytes.append(mychar[i:i+8])
print('Byte breakdown: ')
# Walk the bytes in order, strip each byte's marker bits and accumulate the
# remaining payload bits in `data`.
for i in range(len(mybytes)):
# Current byte as a bit string.
bt = mybytes[i]
# Lead byte?
if i == 0:
# Drop the char_length leading 1 bits plus the 0 bit that ends them.
data = bt[ char_length + 1 : ]
# Continuation bytes: drop the first two bits (the '10' marker in well-formed
# UTF-8; not validated here) and append the rest to the payload.
else:
data = data + bt[2:]