diff --git a/binary_mapping.txt b/binary_mapping.txt
index 4f7b9f7..97aa1cb 100644
--- a/binary_mapping.txt
+++ b/binary_mapping.txt
@@ -1,6 +1,7 @@
 cpp/wordcount_baseline cpp/wordcount_baseline.cpp
 cpp/wordcount cpp/wordcount.cpp
 c/wordcount c/wordcount.c
+c/wordcount_opt c/wordcount_opt.c
 d/wordcount d/wordcount.d
 go/bin/wordcount go/src/wordcount/wordcount.go
 haskell/WordCount haskell/WordCount.hs
diff --git a/c/wordcount_opt.c b/c/wordcount_opt.c
new file mode 100644
index 0000000..3399890
--- /dev/null
+++ b/c/wordcount_opt.c
@@ -0,0 +1,260 @@
+#include <errno.h>
+#include <nmmintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#define TABLE_SIZE (1L << 27)       // Main hash table entries.
+#define ENTRIES_MAX_SIZE (1L << 30) // Max number of distinct words.
+#define NODE_AREA_SIZE (1L << 35)   // Virtual memory area for nodes.
+#define IO_CHUNKS 1024              // Number of SSE registers in stdin read.
+
+// One distinct word: occurrence count, hash chain link and the key bytes.
+struct node {
+    int32_t count;
+    int32_t key_length;
+    struct node *next;
+    uint8_t key[] __attribute__((aligned(16))); // SSE load.
+};
+
+static struct node **htable;     // Chain heads, TABLE_SIZE of them.
+static struct node **entries;    // Every node, in insertion order until sorted.
+static int64_t entry_offset = 0; // Number of nodes stored in entries.
+
+static void *node_mem;              // Arena that all nodes are carved from.
+static int64_t node_mem_offset = 0; // First unused byte of node_mem.
+
+static uint8_t excess_io_bytes[1024 * 1024]; // 1M character word limit.
+
+// Report a fatal error on stderr and terminate with a failure status.
+static void die(const char *message)
+{
+    fprintf(stderr, "%s\n", message);
+    exit(EXIT_FAILURE);
+}
+
+// Map size bytes of anonymous read/write memory without reserving swap
+// space for it (MAP_NORESERVE). Exits the process when the mapping fails.
+static void *virt_alloc(size_t size)
+{
+    // An anonymous mapping has no backing file; portable callers pass -1
+    // as the descriptor rather than 0 (which is stdin).
+    void *ptr = mmap(NULL, size,
+        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS | MAP_NORESERVE,
+        -1, 0);
+    if (ptr == MAP_FAILED) {
+        fprintf(stderr, "mmap failed: %d\n", errno);
+        exit(EXIT_FAILURE);
+    }
+    return ptr;
+}
+
+// Count one occurrence of the word key[0..length): bump the counter of the
+// matching node, or create a node and link it into its hash chain and into
+// the entries array.
+static void insert_key(uint8_t *key, int length)
+{
+    uint32_t hash = 0;
+
+    for (int i = 0; i != length; i++)
+        hash = hash * 33 + key[i];
+
+    struct node **pptr = &htable[hash & (TABLE_SIZE - 1)];
+    for (; *pptr; pptr = &(*pptr)->next) { // Search fallback linked list.
+        struct node *ptr = *pptr;
+        if (ptr->key_length != length)
+            continue;
+        if (!memcmp(ptr->key, key, length)) { // Found duplicate.
+            ptr->count++;
+            return;
+        }
+    }
+
+    // Header, key and terminator, rounded up to a multiple of 16 so the
+    // following node keeps the alignment needed by the SSE loads.
+    int64_t node_size = (sizeof(struct node) + length + 1 + 15) & ~15;
+    if (entry_offset == ENTRIES_MAX_SIZE ||
+        node_mem_offset + node_size > NODE_AREA_SIZE)
+        die("Too many distinct words.");
+
+    // node_mem is a void pointer, so the offset is applied to a byte pointer.
+    struct node *node = (struct node *)((uint8_t *)node_mem + node_mem_offset);
+    node_mem_offset += node_size;
+    node->count = 1;
+    node->key_length = length;
+    node->next = NULL;
+    memcpy(node->key, key, length);
+
+    // Add NUL termination to ensure correct comparison when one key
+    // is a prefix of the compared key.
+    node->key[length] = '\0';
+    *pptr = node;
+    entries[entry_offset++] = node;
+}
+
+// qsort callback over struct node pointers: higher counts first, equal
+// counts ordered by ascending key bytes.
+static int compare(const void *aptr, const void *bptr)
+{
+    const struct node *a = *(const struct node **)aptr;
+    const struct node *b = *(const struct node **)bptr;
+
+    // Distinct nodes never hold equal keys, so only a self-comparison could
+    // make the scan below run past the end of the key.
+    if (a == b)
+        return 0;
+    if (a->count != b->count)
+        return b->count - a->count;
+
+    const __m128i *apacked = (const __m128i *)a->key;
+    const __m128i *bpacked = (const __m128i *)b->key;
+
+    int alen = a->key_length;
+    int blen = b->key_length;
+
+    for (int i = 0; ; i++) {
+        // Index of the first byte that differs in this 16-byte chunk, or 16.
+        int idx = _mm_cmpestri(apacked[i], alen, bpacked[i], blen,
+            _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY);
+        if (idx < 16) {
+            int pos = i * 16 + idx;
+            return (int)a->key[pos] - (int)b->key[pos];
+        }
+        alen -= 16;
+        blen -= 16;
+    }
+}
+
+// Read from fd until size bytes are stored or the input ends. Returns the
+// number of bytes stored, which is below size only at end of input. Exits
+// the process on a read error.
+static int read_full(int fd, uint8_t *buffer, int size)
+{
+    int total = 0;
+
+    while (total < size) {
+        ssize_t got = read(fd, buffer + total, size - total);
+        if (got == 0)
+            break; // End of input.
+        if (got < 0) {
+            if (errno == EINTR)
+                continue; // Interrupted before any data arrived; retry.
+            fprintf(stderr, "read failed: %d\n", errno);
+            exit(EXIT_FAILURE);
+        }
+        total += got;
+    }
+    return total;
+}
+
+// Append length bytes to the partial word in excess_io_bytes, which already
+// holds excess bytes. Returns the new partial word length; exits when the
+// word does not fit.
+static int append_excess(int excess, const uint8_t *bytes, int length)
+{
+    if (length > (int)sizeof(excess_io_bytes) - excess)
+        die("Word exceeds the 1M character limit.");
+    memcpy(&excess_io_bytes[excess], bytes, length);
+    return excess + length;
+}
+
+// Split stdin into words separated by tab, newline or space and count each
+// word with insert_key().
+static void parse_input(void)
+{
+    // A full 16-byte source for the vector load; loading straight from the
+    // 4-byte string literal would read past its end.
+    static const uint8_t separator_bytes[16] __attribute__((aligned(16))) =
+        "\t\n ";
+    const __m128i separators =
+        _mm_load_si128((const __m128i *)separator_bytes);
+    // Aligned because the chunks are fetched with 16-byte vector loads.
+    uint8_t buffer[IO_CHUNKS * 16] __attribute__((aligned(16)));
+    int n, excess = 0;
+
+    do {
+        int start = 0; // Offset of the first byte of the pending word.
+
+        // A short count means end of input, never a partial pipe read.
+        n = read_full(STDIN_FILENO, buffer, sizeof(buffer));
+        if (n < (int)sizeof(buffer)) // Pad so the final word is terminated.
+            memset(&buffer[n], ' ', sizeof(buffer) - n);
+
+        for (int c = 0; c != IO_CHUNKS; c++) {
+            // Bit i is set when byte i of the chunk is a separator.
+            uint16_t mask = _mm_cvtsi128_si32(_mm_cmpestrm(separators, 3,
+                ((const __m128i *)buffer)[c], 16,
+                _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK));
+            if (!mask)
+                continue;
+
+            if (excess) { // Use the word in the excess buffer before moving on.
+                int split = __builtin_ctz(mask);
+                int acc = c * 16 + split;
+                int word_length = append_excess(excess, buffer, acc);
+                insert_key(excess_io_bytes, word_length);
+                start = acc + 1;
+                mask &= ~(1 << split);
+                excess = 0;
+            }
+
+            int index = c * 16;
+            while (mask) { // Find the separators in the 16 byte chunk.
+                int bit = __builtin_ctz(mask);
+                index += bit;
+                if (start != index)
+                    insert_key(&buffer[start], index - start);
+                start = ++index;
+                mask >>= bit + 1;
+            }
+        }
+        if (start < n) // Move rest to excess.
+            excess = append_excess(excess, &buffer[start], n - start);
+    } while (n == (int)sizeof(buffer));
+
+    if (excess) // Safety net; the padding above normally flushes the word.
+        insert_key(excess_io_bytes, excess);
+}
+
+// Count the words on stdin and print them as "word\tcount" lines, most
+// frequent first.
+int main(int argc, char *argv[])
+{
+    char info[64];
+    int info_length = 0, prev_count = -1;
+
+    (void)argc;
+    (void)argv;
+
+    htable = calloc(TABLE_SIZE, sizeof(*htable));
+    if (!htable)
+        die("Hash table allocation failed.");
+    entries = virt_alloc(sizeof(*entries) * ENTRIES_MAX_SIZE);
+    node_mem = virt_alloc(NODE_AREA_SIZE);
+
+    if ((uintptr_t)node_mem % 16) {
+        fprintf(stderr, "Node memory not aligned to 16 bytes.\n");
+        exit(EXIT_FAILURE);
+    }
+    parse_input(); // Parse and insert.
+    qsort(entries, entry_offset, sizeof(*entries), compare);
+
+    for (int64_t i = 0; i != entry_offset; i++) {
+        struct node *ptr = entries[i];
+
+        if (ptr->count != prev_count) { // Reformat the suffix only on change.
+            info_length = sprintf(info, "\t%d\n", ptr->count);
+            prev_count = ptr->count;
+        }
+        fwrite(ptr->key, 1, ptr->key_length, stdout);
+        fwrite(info, 1, info_length, stdout);
+    }
+    if (fflush(stdout) != 0) {
+        fprintf(stderr, "Writing the result failed: %d\n", errno);
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/run_commands.txt b/run_commands.txt
index b0e26b0..18e8e29 100644
--- a/run_commands.txt
+++ b/run_commands.txt
@@ -1,5 +1,6 @@
 bash/wordcount.sh
 c/wordcount
+c/wordcount_opt
 cpp/wordcount
 cpp/wordcount_clang
 cpp/wordcount_baseline
diff --git a/scripts/build.sh b/scripts/build.sh
index 2832376..2e1f537 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -7,6 +7,7 @@ clang++-3.6 wordcount.cpp -std=c++11 -o wordcount_clang -O3
 
 cd ../c
 gcc wordcount.c -o wordcount -O3 -Wall
+clang-3.6 -O3 -Wall -msse4.2 -std=gnu99 wordcount_opt.c -o wordcount_opt
 
 cd ../clojure
 if [ ! -f ../clojure.jar ]; then
@@ -54,4 +55,4 @@ cp target/release/wordcount .
 cd ..
 
 cd ../scala
-scalac Wordcount.scala
\ No newline at end of file
+scalac Wordcount.scala