From 2879cf4bf7b0c2089e5e5f6a3ed28fb17abd8063 Mon Sep 17 00:00:00 2001 From: Paul Arterburn Date: Fri, 13 Dec 2024 18:20:06 -0700 Subject: [PATCH] Fix words counter --- Gemfile | 2 +- Gemfile.lock | 4 ++-- app/controllers/entries_controller.rb | 2 +- app/models/entry.rb | 3 ++- app/views/entries/review.html.haml | 2 +- lib/tasks/entry.rake | 13 ++++++++++--- 6 files changed, 17 insertions(+), 9 deletions(-) diff --git a/Gemfile b/Gemfile index 1e64a271..f5a136fc 100644 --- a/Gemfile +++ b/Gemfile @@ -37,7 +37,7 @@ gem 'randomized_field', '~> 1.0' # builds user_keys gem 'rest-client' # RESTClient gem 'rubyzip', '~> 2' gem 'summernote-rails', '~> 0.8.20.0', git: "https://github.com/parterburn/summernote-rails" -gem 'words_counted' # Year in Review +gem 'words_counted', '~> 1.0', '>= 1.0.3' # Year in Review gem 'zip-zip' gem "chartkick", "~> 5" gem "loofah", ">= 2.5" diff --git a/Gemfile.lock b/Gemfile.lock index 23f1fb23..da6d32e9 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -483,7 +483,7 @@ GEM websocket-driver (0.7.6) websocket-extensions (>= 0.1.0) websocket-extensions (0.1.5) - words_counted (1.0.2) + words_counted (1.0.3) xpath (3.2.0) nokogiri (~> 1.8) zeitwerk (2.6.12) @@ -570,7 +570,7 @@ DEPENDENCIES uglifier webdrivers (~> 5.0) webmock - words_counted + words_counted (~> 1.0, >= 1.0.3) zip-zip RUBY VERSION diff --git a/app/controllers/entries_controller.rb b/app/controllers/entries_controller.rb index 113f5121..16ae163c 100644 --- a/app/controllers/entries_controller.rb +++ b/app/controllers/entries_controller.rb @@ -253,7 +253,7 @@ def review if @total_count.positive? 
@body_text = @entries.map { |e| ActionView::Base.full_sanitizer.sanitize(e.body) }.join(" ") tokeniser = WordsCounted::Tokeniser.new(@body_text) - @words_counter = tokeniser.tokenise + @words_counter = tokeniser.tokenise(exclude: Entry::WORDS_NOT_TO_COUNT) if @total_count > 20 all_user_entry_count = Entry.where("date >= '#{@year}-01-01'::DATE AND date <= '#{@year}-12-31'::DATE").group(:user_id).reorder("count_all").count.values @pctile = (((all_user_entry_count.find_index(@total_count) + 1).to_f / all_user_entry_count.count) * 100).round diff --git a/app/models/entry.rb b/app/models/entry.rb index cef8b040..cd6cd4e6 100644 --- a/app/models/entry.rb +++ b/app/models/entry.rb @@ -6,7 +6,8 @@ class Entry < ActiveRecord::Base include Entry::AiAssistant mount_uploader :image, ImageUploader - WORDS_NOT_TO_COUNT = ['so', 'went', 'while', 's', 'amp', '-', 'p', 'br', 'div', 'img', 'span', 'the', 'of', 'and', 'a', 'to', 'in', 'is', 'that', 'it', 'was', 'for', 'on', 'are', 'as', 'with', 'at', 'be', 'this', 'have', 'from', 'or', 'had', 'by', 'but', 'not', 'what', 'all', 'were', 'when', 'can', 'said', 'there', 'use', 'an', 'each', 'which', 'do', 'how', 'if'] + WORDS_NOT_TO_COUNT = ['s', 'amp', '-', 'p', 'br', 'div', 'img', 'span', 'hr', '<', '>'] + COMMON_WORDS = WORDS_NOT_TO_COUNT + ['you', 'so', 'went', 'while', 's', 'amp', '-', 'p', 'br', 'div', 'img', 'span', 'the', 'of', 'and', 'a', 'to', 'in', 'is', 'that', 'it', 'was', 'for', 'on', 'are', 'as', 'with', 'at', 'be', 'this', 'have', 'from', 'or', 'had', 'by', 'but', 'not', 'what', 'all', 'were', 'when', 'can', 'said', 'there', 'use', 'an', 'each', 'which', 'do', 'how', 'if'] belongs_to :user belongs_to :inspiration, optional: true diff --git a/app/views/entries/review.html.haml b/app/views/entries/review.html.haml index 9609a4f6..6e2385dd 100644 --- a/app/views/entries/review.html.haml +++ b/app/views/entries/review.html.haml @@ -157,7 +157,7 @@ %span= "We're all still journaling in first-person...we collectively used \"i\" 
203,180 times in our entries!".html_safe .s-review-details - grouped_words = @words_counter.group_by(&:itself).transform_values(&:count).sort_by { |_k, v| v }.reverse.to_h - - grouped_words = grouped_words.select { |word, count| !Entry::WORDS_NOT_TO_COUNT.include?(word) } + - grouped_words = grouped_words.select { |word, count| !Entry::COMMON_WORDS.include?(word) } - grouped_words.first(25).each do |word, count| %h3 = word diff --git a/lib/tasks/entry.rake b/lib/tasks/entry.rake index 82cce409..543a593a 100644 --- a/lib/tasks/entry.rake +++ b/lib/tasks/entry.rake @@ -139,14 +139,21 @@ namespace :entry do all_entries = Entry.where("date >= '#{year}-01-01'::DATE AND date <= '#{year}-12-31'::DATE") entries_bodies = all_entries.map { |e| ActionView::Base.full_sanitizer.sanitize(e.body) }.join(" ") - tokeniser = WordsCounted::Tokeniser.new(entries_bodies) - total_words = tokeniser.tokenise.length.to_f + tokenizer = WordsCounted::Tokeniser.new(entries_bodies).tokenise(exclude: Entry::WORDS_NOT_TO_COUNT) + total_words = tokenizer.count + + counter = WordsCounted.count(entries_bodies) + most_frequent = counter.token_frequency.first(400).select { |w| !Entry::COMMON_WORDS.include?(w[0]) }.first(40).map { |w| "#{w[0]}: #{number_with_delimiter(w[1])}" } avg_words = total_words / all_entries.count total_chars = entries_bodies.length avg_chars = total_chars / all_entries.count avg_tweets_per_post = ((avg_chars).to_f / 280).ceil - most_frequent = words_counter.token_frequency.first(40).map { |w| "#{w[0]}: #{number_with_delimiter(w[1])}" } + + grouped_words = tokenizer.group_by(&:itself).transform_values(&:count).sort_by { |_k, v| v }.reverse.to_h + grouped_words = grouped_words.select { |word, count| !Entry::WORDS_NOT_TO_COUNT.include?(word) } + grouped_words.first(25) + p "Users created: #{number_with_delimiter(User.where("created_at >= '#{year}-01-01'::DATE AND created_at <= '#{year}-12-31'::DATE").count)}" p "Entries created in #{year}: 
#{number_with_delimiter(Entry.where("created_at >= '#{year}-01-01'::DATE AND created_at <= '#{year}-12-31'::DATE").count)}" p "Entries for #{year}: #{number_with_delimiter(all_entries.count)}"