-
Notifications
You must be signed in to change notification settings - Fork 3
/
make_lists.R
121 lines (108 loc) · 4.6 KB
/
make_lists.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -------------------------------------+
# Stopwords: Latin and Greek
# Make TXT/Markdown stoplists from JSON
# -------------------------------------+
source("~/Documents/github/helpers/R/helpers.R")
# GREEK -------------------------------------------------------------------
# Set version number
version_greek <- "2.8"
# Convert current JSON list to TXT with Markdown headings.
# The conversion is a table of (regex, replacement) pairs applied one after
# the other; the order matters because later rules work on the output of
# earlier ones.
greek_json <- read_file("stopwords_greek.json")
greek_rules <- list(
  # Remove the opening "{" line and the "}\n}\n" closing sequence
  c("^\\{\n", ""),
  c("\\}\n\\}\n", ""),
  # Keys starting with an uppercase letter become level-1 headings
  c("\\s*\"([A-Z]+.*)\": [\\[\\{\\],]+\n", "# \\1\n"),
  # Any other key that opens an array becomes a level-2 heading
  c("\\s*\"(.*)\": \\[\n", "## \\1\n"),
  # Disabled alternative (single character class):
  # c("[\":\\{\\}\\[\\],]", "\n"),
  # Leftover JSON punctuation not directly followed by a double quote
  # is turned into a line break
  c("\\:(?!\")", "\n"),
  c("\\[(?!\")", "\n"),
  c("\\](?!\")", "\n"),
  c("\\{(?!\")", "\n"),
  c("\\}(?!\")", "\n"),
  c("\\\\(?!\")", "\n"),
  # Every comma, every double quote not followed by another double quote,
  # and every remaining backslash also become line breaks
  c(",", "\n"),
  c('\\"(?!\")', "\n"),
  c("\\\\", "\n"),
  # Collapse a line break followed by whitespace into a single line break
  c("\n\\s+", "\n"),
  # Put blank lines before headings, then reduce triple breaks to double
  c("##", "\n##"),
  c("\n#(.)", "\n\n#\\1"),
  c("\n\n\n", "\n\n")
)
greek_txt <- Reduce(
  function(txt, rule) str_replace_all(txt, rule[1], rule[2]),
  greek_rules,
  greek_json
)
write_file(greek_txt, "./test/test_json_txt/stopwords_greek_raw.txt")
# Add metadata
# Builds the comment header for the Greek list, prepends it to the raw list
# written by the conversion step, normalises the text with
# utf8::utf8_normalize() and writes both the current file and a versioned copy.
today <- format(Sys.time(), "%Y-%m-%d")
greek_raw_path <- "./test/test_json_txt/stopwords_greek_raw.txt"
greek_raw <- read_file(greek_raw_path)
# Item count as reported by the count_items_in_txt_list() helper (defined
# outside this file), computed on the raw list before the header is added
current_greek_count <- count_items_in_txt_list(greek_raw_path)
greek_metadata <- paste0(
  "# Ancient Greek stopwords", "\n",
  "# version ", version_greek, "\n",
  "# ", today, "\n",
  "# Aurélien Berra", "\n",
  "# ", "\n",
  "# Ancient Greek stopwords for textual analysis", "\n",
  "# language: Ancient Greek (grc)", "\n",
  "# type: dataset", "\n",
  "# items count: ", current_greek_count, "\n",
  "# https://github.com/aurelberra/stopwords", "\n",
  "# rights: CC-BY-NC-SA", "\n",
  "\n"
)
stopwords_greek <- paste0(greek_metadata, greek_raw)
stopwords_greek <- utf8::utf8_normalize(stopwords_greek)
write_file(stopwords_greek, "stopwords_greek.txt")
# Versioned copy: every dot in the version becomes an underscore, so a
# multi-part version such as "2.8.1" yields "stopwords_greek_v2_8_1.txt"
# (replacing only the first dot would leave "v2_8.1").
greek_version_tag <- str_replace_all(version_greek, "\\.", "_")
write_file(
  stopwords_greek,
  paste0("./versions/stopwords_greek_v", greek_version_tag, ".txt")
)
# Make file without categories as comments:
# drop every "#..." heading line, then squeeze runs of line breaks into one.
greek_no_comments <- str_replace_all(greek_raw, "#.+\n", "")
greek_no_comments <- str_replace_all(greek_no_comments, "\n+", "\n")
write_file(
  greek_no_comments,
  "./test/test_json_txt/stopwords_greek_no_comments.txt"
)
# LATIN -------------------------------------------------------------------
# Set version number
version_latin <- "2.6"
# Convert current JSON list to TXT with Markdown headings.
# The conversion is a table of (regex, replacement) pairs applied one after
# the other; the order matters because later rules work on the output of
# earlier ones.
latin_json <- read_file("stopwords_latin.json")
latin_rules <- list(
  # Remove the opening "{" line and the "}\n}\n" closing sequence
  c("^\\{\n", ""),
  c("\\}\n\\}\n", ""),
  # Keys starting with an uppercase letter become level-1 headings
  c("\\s*\"([A-Z]+.*)\": [\\[\\{\\],]+\n", "# \\1\n"),
  # Any other key that opens an array becomes a level-2 heading
  c("\\s*\"(.*)\": \\[\n", "## \\1\n"),
  # Disabled alternative (single character class):
  # c("[\":\\{\\}\\[\\],]", "\n"),
  # Leftover JSON punctuation not directly followed by a double quote
  # is turned into a line break
  c("\\:(?!\")", "\n"),
  c("\\[(?!\")", "\n"),
  c("\\](?!\")", "\n"),
  c("\\{(?!\")", "\n"),
  c("\\}(?!\")", "\n"),
  c("\\\\(?!\")", "\n"),
  # Every comma, every double quote not followed by another double quote,
  # and every remaining backslash also become line breaks
  c(",", "\n"),
  c('\\"(?!\")', "\n"),
  c("\\\\", "\n"),
  # Collapse a line break followed by whitespace into a single line break
  c("\n\\s+", "\n"),
  # Put blank lines before headings, then reduce triple breaks to double
  c("##", "\n##"),
  c("\n#(.)", "\n\n#\\1"),
  c("\n\n\n", "\n\n")
)
latin_txt <- Reduce(
  function(txt, rule) str_replace_all(txt, rule[1], rule[2]),
  latin_rules,
  latin_json
)
write_file(latin_txt, "./test/test_json_txt/stopwords_latin_raw.txt")
# Add metadata
# Builds the comment header for the Latin list, prepends it to the raw list
# written by the conversion step and writes both the current file and a
# versioned copy.
today <- format(Sys.time(), "%Y-%m-%d")
latin_raw_path <- "./test/test_json_txt/stopwords_latin_raw.txt"
latin_raw <- read_file(latin_raw_path)
# Item count as reported by the count_items_in_txt_list() helper (defined
# outside this file), computed on the raw list before the header is added
current_latin_count <- count_items_in_txt_list(latin_raw_path)
latin_metadata <- paste0(
  "# Ancient Latin stopwords", "\n",
  "# version ", version_latin, "\n",
  "# ", today, "\n",
  "# Aurélien Berra", "\n",
  "# ", "\n",
  "# Ancient Latin stopwords for textual analysis", "\n",
  "# language: Latin (la, lat)", "\n",
  "# type: dataset", "\n",
  "# items count: ", current_latin_count, "\n",
  "# https://github.com/aurelberra/stopwords", "\n",
  "# rights: CC-BY-NC-SA", "\n",
  "\n"
)
# NOTE(review): the Greek section passes its text through
# utf8::utf8_normalize() before writing; no normalisation is applied here.
# Confirm that this difference is intended.
stopwords_latin <- paste0(latin_metadata, latin_raw)
write_file(stopwords_latin, "stopwords_latin.txt")
# Versioned copy: every dot in the version becomes an underscore, so a
# multi-part version such as "2.6.1" yields "stopwords_latin_v2_6_1.txt"
# (replacing only the first dot would leave "v2_6.1").
latin_version_tag <- str_replace_all(version_latin, "\\.", "_")
write_file(
  stopwords_latin,
  paste0("./versions/stopwords_latin_v", latin_version_tag, ".txt")
)
# Make file without categories as comments:
# drop every "#..." heading line, then squeeze runs of line breaks into one.
latin_no_comments <- str_replace_all(latin_raw, "#.+\n", "")
latin_no_comments <- str_replace_all(latin_no_comments, "\n+", "\n")
write_file(
  latin_no_comments,
  "./test/test_json_txt/stopwords_latin_no_comments.txt"
)