diff --git a/public/combined_analysis.json b/public/combined_analysis.json index 04294c5..b2b4a28 100644 --- a/public/combined_analysis.json +++ b/public/combined_analysis.json @@ -1,94 +1,204 @@ { "human": { - "lexical_diversity_ranges": { - "0.0": 0, - "0.1": 1, - "0.15": 1, - "0.2": 12, - "0.25": 80, - "0.3": 216, - "0.35": 401, - "0.4": 919, - "0.45": 1409, - "0.5": 2011, - "0.55": 2905, - "0.6": 2277, - "0.65": 3499, - "0.7": 2257, - "0.75": 2462, - "0.8": 2158, - "0.85": 1737, - "0.9": 1553, - "0.95": 184, - "1.0": 918 + "lexical_diversity": { + "0.0": 1, + "0.2": 9, + "0.25": 52, + "0.3": 130, + "0.35": 291, + "0.4": 684, + "0.45": 1203, + "0.5": 1817, + "0.55": 2824, + "0.6": 2203, + "0.65": 3438, + "0.7": 2367, + "0.75": 2649, + "0.8": 2351, + "0.85": 1981, + "0.9": 1774, + "0.95": 212, + "1.0": 1014 }, - "avg_sentence_length_ranges": { - "0": 68, - "5": 1262, - "10": 4062, - "15": 6919, - "20": 6054, - "25": 3390, - "30": 1568, - "35": 687, - "40": 355, - "45": 221, - "50": 127, - "55": 58, - "60": 38, - "65": 51, - "70": 26, - "75": 29, - "80": 16, + "average_word_count": { + "0": 72, + "5": 1286, + "10": 4055, + "15": 6807, + "20": 6142, + "25": 3384, + "30": 1527, + "35": 726, + "40": 362, + "45": 206, + "50": 128, + "55": 76, + "60": 51, + "65": 48, + "70": 25, + "75": 18, + "80": 11, "85": 18, - "90": 7, - "95": 5 + "90": 9, + "95": 8, + "100": 2 + }, + "average_word_length": { + "1.0": 1, + "1.2": 1, + "1.8": 3, + "2.0": 11, + "2.2": 20, + "2.4": 34, + "2.6": 107, + "2.8": 155, + "3.0": 433, + "3.2": 845, + "3.4": 1943, + "3.6": 3945, + "3.8": 4803, + "4.0": 5107, + "4.2": 3490, + "4.4": 2037, + "4.6": 967, + "4.8": 459, + "5.0": 289, + "5.2": 159, + "5.4": 90, + "5.6": 49, + "5.8": 13, + "6.0": 15, + "6.2": 5, + "6.4": 4, + "6.6": 5, + "6.8": 1, + "7.0": 3, + "7.2": 1, + "8.0": 1, + "8.2": 3, + "9.2": 1, + "10.0": 0 + }, + "flesch_kincaid_grade": { + "0": 238, + "1": 217, + "2": 416, + "3": 696, + "4": 1162, + "5": 1711, + "6": 2231, + "7": 2880, + "8": 2997, + "9": 2854, + "10": 2509, + "11": 1906, + "12": 1406, + "13": 991, + "14": 717, + "15": 512, + "16": 383, + "17": 248, + "18": 926 } }, "ai": { - "lexical_diversity_ranges": { - "0.0": 16, - "0.15": 1, - "0.2": 12, - "0.25": 105, - "0.3": 572, - "0.35": 1570, - "0.4": 4369, - "0.45": 6360, - "0.5": 5631, - "0.55": 3737, - "0.6": 1163, - "0.65": 781, - "0.7": 167, - "0.75": 114, - "0.8": 96, - "0.85": 94, - "0.9": 114, - "0.95": 18, - "1.0": 80 + "lexical_diversity": { + "0.0": 18, + "0.2": 11, + "0.25": 70, + "0.3": 371, + "0.35": 1053, + "0.4": 3478, + "0.45": 5995, + "0.5": 6066, + "0.55": 4599, + "0.6": 1511, + "0.65": 1051, + "0.7": 228, + "0.75": 141, + "0.8": 95, + "0.85": 98, + "0.9": 117, + "0.95": 17, + "1.0": 81 }, - "avg_sentence_length_ranges": { + "average_word_count": { "0": 48, - "5": 78, - "10": 272, - "15": 2567, - "20": 7042, - "25": 6410, - "30": 3355, - "35": 2040, + "5": 81, + "10": 266, + "15": 2569, + "20": 7003, + "25": 6443, + "30": 3362, + "35": 2015, "40": 1274, - "45": 732, - "50": 393, - "55": 254, - "60": 154, - "65": 79, - "70": 67, + "45": 721, + "50": 414, + "55": 257, + "60": 160, + "65": 84, + "70": 64, "75": 54, - "80": 39, - "85": 24, - "90": 20, - "95": 20, + "80": 41, + "85": 22, + "90": 21, + "95": 19, "100": 3 + }, + "average_word_length": { + "0.0": 18, + "2.4": 1, + "2.6": 2, + "2.8": 9, + "3.0": 21, + "3.2": 105, + "3.4": 344, + "3.6": 1426, + "3.8": 2693, + "4.0": 4479, + "4.2": 5181, + "4.4": 4855, + "4.6": 3160, + "4.8": 1551, + "5.0": 728, + "5.2": 269, + "5.4": 83, + 
"5.6": 28, + "5.8": 18, + "6.0": 8, + "6.2": 4, + "6.4": 1, + "6.6": 1, + "6.8": 4, + "7.0": 1, + "7.2": 2, + "7.6": 2, + "7.8": 1, + "8.0": 1, + "8.4": 1, + "8.6": 1, + "10.0": 0 + }, + "flesch_kincaid_grade": { + "0": 19, + "1": 2, + "2": 12, + "3": 44, + "4": 87, + "5": 235, + "6": 482, + "7": 906, + "8": 1458, + "9": 2055, + "10": 2534, + "11": 2772, + "12": 2636, + "13": 2268, + "14": 1989, + "15": 1542, + "16": 1234, + "17": 973, + "18": 3752 } } } \ No newline at end of file diff --git a/src/analysis/analyse.py b/src/analysis/analyse.py index 26de620..60b988b 100644 --- a/src/analysis/analyse.py +++ b/src/analysis/analyse.py @@ -4,7 +4,7 @@ from nltk.tokenize import sent_tokenize, word_tokenize from tqdm import tqdm - +# Analysis functions def count_words(text): return len(word_tokenize(text)) @@ -13,139 +13,139 @@ def count_sentences(text): return len(sent_tokenize(text)) -def average_word_count(text): - word_count = count_words(text) - sentence_count = count_sentences(text) +def count_syllables(word): + word = word.lower() + syllable_count = 0 + vowels = "aeiouy" + if word[0] in vowels: + syllable_count += 1 + for index in range(1, len(word)): + if word[index] in vowels and word[index - 1] not in vowels: + syllable_count += 1 + if word.endswith("e"): + syllable_count -= 1 + if syllable_count == 0: + syllable_count = 1 + return syllable_count - if sentence_count == 0: + +def total_syllables(text): + words = word_tokenize(text) + return sum(count_syllables(word) for word in words) + + +def average_word_length(text): + words = word_tokenize(text) + word_lengths = [len(word) for word in words] + if len(word_lengths) == 0: return 0 + return round(sum(word_lengths) / len(word_lengths), 2) + +def average_word_count(word_count, sentence_count): + if sentence_count == 0: + return 0 return round(word_count / sentence_count, 2) -def lexical_diversity(text): - words = word_tokenize(text.lower()) +def lexical_diversity(words): if len(words) == 0: return 0 return round(len(set(words)) / len(words), 2) -def analyse_text(text): - avg_word_count = average_word_count(text) - lex_diversity = lexical_diversity(text) - - return { - "average_word_count": avg_word_count, - "lexical_diversity": lex_diversity - } +def flesch_kincaid_grade(word_count, sentence_count, syllable_count): + if word_count == 0 or sentence_count == 0: + return 0 + grade_level = 0.39 * (word_count / sentence_count) + \ + 11.8 * (syllable_count / word_count) - 15.59 + # Ensure grade level is between 0 and 18 + return max(0, min(round(grade_level, 2), 18)) + + +class TextAnalysis: + def __init__(self, text): + self.text = text + self.words = word_tokenize(text) + self.word_count = len(self.words) + self.sentence_count = count_sentences(text) + self.syllable_count = total_syllables(text) + self.average_word_length = average_word_length(text) + + def analyze(self): + return { + "average_word_length": self.average_word_length, + "average_word_count": average_word_count(self.word_count, self.sentence_count), + "lexical_diversity": lexical_diversity(self.words), + "flesch_kincaid_grade": flesch_kincaid_grade(self.word_count, self.sentence_count, self.syllable_count) + } -def export_to_json(file_name, stats): +def export_to_json(file_name, data): with open(file_name, mode='w', encoding='utf-8') as file: - json.dump(stats, file, ensure_ascii=False, indent=4) + json.dump(data, file, ensure_ascii=False, indent=4) -def create_frequency_ranges(stats, min_val, max_val, range_size): - # Initialise the frequency ranges +def 
create_sorted_frequency_ranges(stats, min_val, max_val, range_size): frequency_ranges = {} - - # Loop through the stats and assign each value to a range for stat in stats: - # If the stat exceeds the maximum value, ignore it if stat > max_val: continue - - # Calculate the range index for the current stat range_index = int((stat - min_val) / range_size) range_start = min_val + range_index * range_size range_label = f"{round(range_start, 2)}" + frequency_ranges[range_label] = frequency_ranges.get( + range_label, 0) + 1 + return dict(sorted(frequency_ranges.items(), key=lambda item: float(item[0]))) + - if range_label not in frequency_ranges: - frequency_ranges[range_label] = 1 - else: - frequency_ranges[range_label] += 1 +def process_responses(responses, desc): + analyses = {"lexical_diversity": [], + "average_word_count": [], "average_word_length": [], "flesch_kincaid_grade": []} + for answer in tqdm(responses, desc=desc): + analysis = TextAnalysis(answer).analyze() + for key, value in analysis.items(): + analyses[key].append(value) + return analyses - return frequency_ranges def main(): - file_path = os.path.join(os.path.dirname(__file__), "all.jsonl") + config = { + "input_file": "all.jsonl", + "output_file": "combined_analysis.json", + "sample_size": 25000, + "range_settings": { + "lexical_diversity": {"min_val": 0, "max_val": 1.05, "range_size": 0.05}, + "average_word_count": {"min_val": 0, "max_val": 100, "range_size": 5}, + "average_word_length": {"min_val": 0, "max_val": 10, "range_size": 0.2}, + "flesch_kincaid_grade": {"min_val": 0, "max_val": 18, "range_size": 1}, + } + } + + file_path = os.path.join(os.path.dirname(__file__), config["input_file"]) with open(file_path, "r", encoding="utf-8") as file: data = [json.loads(line) for line in file] - human_lex_diversity = [] - human_avg_sentence_length = [] - ai_lex_diversity = [] - ai_avg_sentence_length = [] - - human_responses = [] - ai_responses = [] - - # Separate human and AI responses - for entry in data: - human_answers = entry["human_answers"] - chatgpt_answers = entry["chatgpt_answers"] - - human_responses.extend(human_answers) - ai_responses.extend(chatgpt_answers) - - # Randomly select 25,000 entries for human and AI responses - human_selected = random.sample(human_responses, 25000) - ai_selected = random.sample(ai_responses, 25000) - - # Analyze human responses - for answer in tqdm(human_selected, desc="Analysing human responses"): - analysis = analyse_text(answer) - human_lex_diversity.append(analysis["lexical_diversity"]) - human_avg_sentence_length.append(analysis["average_word_count"]) - - # Analyze AI responses - for answer in tqdm(ai_selected, desc="Analysing AI responses"): - analysis = analyse_text(answer) - ai_lex_diversity.append(analysis["lexical_diversity"]) - ai_avg_sentence_length.append(analysis["average_word_count"]) - - # Range sizes for lexical diversity and average sentence length - lex_div_range_size = 0.05 - avg_sent_len_range_size = 5 - - # Create frequency ranges for lexical diversity - human_lex_diversity_ranges = create_frequency_ranges( - human_lex_diversity, 0, 1.05, lex_div_range_size) - ai_lex_diversity_ranges = create_frequency_ranges( - ai_lex_diversity, 0, 1.05, lex_div_range_size) - - # Create frequency ranges for average sentence length - human_avg_sentence_length_ranges = create_frequency_ranges( - human_avg_sentence_length, 0, 100, avg_sent_len_range_size) - ai_avg_sentence_length_ranges = create_frequency_ranges( - ai_avg_sentence_length, 0, 100, avg_sent_len_range_size) - - # Sort 
frequency ranges for lexical diversity - sorted_human_lex_diversity_ranges = dict(sorted( - human_lex_diversity_ranges.items(), key=lambda item: float(item[0].split()[0]))) - sorted_ai_lex_diversity_ranges = dict(sorted( - ai_lex_diversity_ranges.items(), key=lambda item: float(item[0].split()[0]))) - - # Sort frequency ranges for average sentence length - sorted_human_avg_sentence_length_ranges = dict(sorted( - human_avg_sentence_length_ranges.items(), key=lambda item: float(item[0].split()[0]))) - sorted_ai_avg_sentence_length_ranges = dict(sorted( - ai_avg_sentence_length_ranges.items(), key=lambda item: float(item[0].split()[0]))) - - # Combine all the analysis data into a single dictionary - combined_data = { - "human": { - "lexical_diversity_ranges": sorted_human_lex_diversity_ranges, - "avg_sentence_length_ranges": sorted_human_avg_sentence_length_ranges - }, - "ai": { - "lexical_diversity_ranges": sorted_ai_lex_diversity_ranges, - "avg_sentence_length_ranges": sorted_ai_avg_sentence_length_ranges - } - } + human_responses = [ + response for entry in data for response in entry["human_answers"]] + ai_responses = [ + response for entry in data for response in entry["chatgpt_answers"]] + + human_selected = random.sample(human_responses, config["sample_size"]) + ai_selected = random.sample(ai_responses, config["sample_size"]) + + human_analyses = process_responses( + human_selected, "Analysing human responses") + ai_analyses = process_responses(ai_selected, "Analysing AI responses") + + combined_data = {"human": {}, "ai": {}} + for key, settings in config["range_settings"].items(): + combined_data["human"][key] = create_sorted_frequency_ranges( + human_analyses[key], **settings) + combined_data["ai"][key] = create_sorted_frequency_ranges( + ai_analyses[key], **settings) - # Export combined data to a JSON file - export_to_json("combined_analysis.json", combined_data) + export_to_json(config["output_file"], combined_data) if __name__ == "__main__": diff --git a/src/analysis/combined_analysis.json b/src/analysis/combined_analysis.json index 03ec436..455c6b9 100644 --- a/src/analysis/combined_analysis.json +++ b/src/analysis/combined_analysis.json @@ -1,93 +1,202 @@ { "human": { - "lexical_diversity_ranges": { - "0.1": 1, - "0.15": 1, - "0.2": 12, - "0.25": 80, - "0.3": 216, - "0.35": 401, - "0.4": 919, - "0.45": 1409, - "0.5": 2011, - "0.55": 2905, - "0.6": 2277, - "0.65": 3499, - "0.7": 2257, - "0.75": 2462, - "0.8": 2158, - "0.85": 1737, - "0.9": 1553, - "0.95": 184, - "1.0": 918 + "lexical_diversity": { + "0.0": 1, + "0.2": 9, + "0.25": 52, + "0.3": 130, + "0.35": 291, + "0.4": 684, + "0.45": 1203, + "0.5": 1817, + "0.55": 2824, + "0.6": 2203, + "0.65": 3438, + "0.7": 2367, + "0.75": 2649, + "0.8": 2351, + "0.85": 1981, + "0.9": 1774, + "0.95": 212, + "1.0": 1014 }, - "avg_sentence_length_ranges": { - "0": 68, - "5": 1262, - "10": 4062, - "15": 6919, - "20": 6054, - "25": 3390, - "30": 1568, - "35": 687, - "40": 355, - "45": 221, - "50": 127, - "55": 58, - "60": 38, - "65": 51, - "70": 26, - "75": 29, - "80": 16, + "average_word_count": { + "0": 72, + "5": 1286, + "10": 4055, + "15": 6807, + "20": 6142, + "25": 3384, + "30": 1527, + "35": 726, + "40": 362, + "45": 206, + "50": 128, + "55": 76, + "60": 51, + "65": 48, + "70": 25, + "75": 18, + "80": 11, "85": 18, - "90": 7, - "95": 5 + "90": 9, + "95": 8, + "100": 2 + }, + "average_word_length": { + "1.0": 1, + "1.2": 1, + "1.8": 3, + "2.0": 11, + "2.2": 20, + "2.4": 34, + "2.6": 107, + "2.8": 155, + "3.0": 433, + "3.2": 845, + "3.4": 
1943, + "3.6": 3945, + "3.8": 4803, + "4.0": 5107, + "4.2": 3490, + "4.4": 2037, + "4.6": 967, + "4.8": 459, + "5.0": 289, + "5.2": 159, + "5.4": 90, + "5.6": 49, + "5.8": 13, + "6.0": 15, + "6.2": 5, + "6.4": 4, + "6.6": 5, + "6.8": 1, + "7.0": 3, + "7.2": 1, + "8.0": 1, + "8.2": 3, + "9.2": 1 + }, + "flesch_kincaid_grade": { + "0": 238, + "1": 217, + "2": 416, + "3": 696, + "4": 1162, + "5": 1711, + "6": 2231, + "7": 2880, + "8": 2997, + "9": 2854, + "10": 2509, + "11": 1906, + "12": 1406, + "13": 991, + "14": 717, + "15": 512, + "16": 383, + "17": 248, + "18": 926 } }, "ai": { - "lexical_diversity_ranges": { - "0.0": 16, - "0.15": 1, - "0.2": 12, - "0.25": 105, - "0.3": 572, - "0.35": 1570, - "0.4": 4369, - "0.45": 6360, - "0.5": 5631, - "0.55": 3737, - "0.6": 1163, - "0.65": 781, - "0.7": 167, - "0.75": 114, - "0.8": 96, - "0.85": 94, - "0.9": 114, - "0.95": 18, - "1.0": 80 + "lexical_diversity": { + "0.0": 18, + "0.2": 11, + "0.25": 70, + "0.3": 371, + "0.35": 1053, + "0.4": 3478, + "0.45": 5995, + "0.5": 6066, + "0.55": 4599, + "0.6": 1511, + "0.65": 1051, + "0.7": 228, + "0.75": 141, + "0.8": 95, + "0.85": 98, + "0.9": 117, + "0.95": 17, + "1.0": 81 }, - "avg_sentence_length_ranges": { + "average_word_count": { "0": 48, - "5": 78, - "10": 272, - "15": 2567, - "20": 7042, - "25": 6410, - "30": 3355, - "35": 2040, + "5": 81, + "10": 266, + "15": 2569, + "20": 7003, + "25": 6443, + "30": 3362, + "35": 2015, "40": 1274, - "45": 732, - "50": 393, - "55": 254, - "60": 154, - "65": 79, - "70": 67, + "45": 721, + "50": 414, + "55": 257, + "60": 160, + "65": 84, + "70": 64, "75": 54, - "80": 39, - "85": 24, - "90": 20, - "95": 20, + "80": 41, + "85": 22, + "90": 21, + "95": 19, "100": 3 + }, + "average_word_length": { + "0.0": 18, + "2.4": 1, + "2.6": 2, + "2.8": 9, + "3.0": 21, + "3.2": 105, + "3.4": 344, + "3.6": 1426, + "3.8": 2693, + "4.0": 4479, + "4.2": 5181, + "4.4": 4855, + "4.6": 3160, + "4.8": 1551, + "5.0": 728, + "5.2": 269, + "5.4": 83, + "5.6": 28, + "5.8": 18, + "6.0": 8, + "6.2": 4, + "6.4": 1, + "6.6": 1, + "6.8": 4, + "7.0": 1, + "7.2": 2, + "7.6": 2, + "7.8": 1, + "8.0": 1, + "8.4": 1, + "8.6": 1 + }, + "flesch_kincaid_grade": { + "0": 19, + "1": 2, + "2": 12, + "3": 44, + "4": 87, + "5": 235, + "6": 482, + "7": 906, + "8": 1458, + "9": 2055, + "10": 2534, + "11": 2772, + "12": 2636, + "13": 2268, + "14": 1989, + "15": 1542, + "16": 1234, + "17": 973, + "18": 3752 } } } diff --git a/src/analysis/distribution_graphs.png b/src/analysis/distribution_graphs.png index 0c364c3..c4c8052 100644 Binary files a/src/analysis/distribution_graphs.png and b/src/analysis/distribution_graphs.png differ diff --git a/src/analysis/graph.py b/src/analysis/graph.py index 86ffd97..a148c0b 100644 --- a/src/analysis/graph.py +++ b/src/analysis/graph.py @@ -24,7 +24,6 @@ def prepare_data(stats): return np.array(values) - def plot_distribution(human_data, ai_data, title, ax): sns.kdeplot(human_data, label='Human', ax=ax, color='blue', fill=True) sns.kdeplot(ai_data, label='AI', ax=ax, color='red', fill=True) @@ -37,10 +36,14 @@ def main(): combined_data = read_stats('combined_analysis.json') # Extract necessary statistics - human_lexical_diversity_stats = combined_data["human"]["lexical_diversity_ranges"] - ai_lexical_diversity_stats = combined_data["ai"]["lexical_diversity_ranges"] - human_avg_sentence_length_stats = combined_data["human"]["avg_sentence_length_ranges"] - ai_avg_sentence_length_stats = combined_data["ai"]["avg_sentence_length_ranges"] + human_lexical_diversity_stats = 
combined_data["human"]["lexical_diversity"] + ai_lexical_diversity_stats = combined_data["ai"]["lexical_diversity"] + human_avg_sentence_length_stats = combined_data["human"]["average_word_count"] + ai_avg_sentence_length_stats = combined_data["ai"]["average_word_count"] + human_avg_word_length_stats = combined_data["human"]["average_word_length"] + ai_avg_word_length_stats = combined_data["ai"]["average_word_length"] + human_fk_grade_stats = combined_data["human"]["flesch_kincaid_grade"] + ai_fk_grade_stats = combined_data["ai"]["flesch_kincaid_grade"] # Prepare data for plotting human_lexical_diversity_data = prepare_data(human_lexical_diversity_stats) @@ -48,14 +51,22 @@ def main(): human_avg_sentence_length_data = prepare_data( human_avg_sentence_length_stats) ai_avg_sentence_length_data = prepare_data(ai_avg_sentence_length_stats) + human_avg_word_length_data = prepare_data(human_avg_word_length_stats) + ai_avg_word_length_data = prepare_data(ai_avg_word_length_stats) + human_fk_grade_data = prepare_data(human_fk_grade_stats) + ai_fk_grade_data = prepare_data(ai_fk_grade_stats) # Plot distributions - fig, axes = plt.subplots(1, 2, figsize=(18, 6)) + fig, axes = plt.subplots(1, 4, figsize=(36, 6)) plot_distribution(human_lexical_diversity_data, ai_lexical_diversity_data, 'Lexical Diversity', axes[0]) plot_distribution(human_avg_sentence_length_data, ai_avg_sentence_length_data, 'Average Sentence Length', axes[1]) + plot_distribution(human_avg_word_length_data, + ai_avg_word_length_data, 'Average Word Length', axes[2]) + plot_distribution(human_fk_grade_data, + ai_fk_grade_data, 'Flesch-Kincaid Grade Level', axes[3]) plt.tight_layout() plt.savefig('distribution_graphs.png') diff --git a/src/app/page.tsx b/src/app/page.tsx index 11c1a0b..5a651e8 100644 --- a/src/app/page.tsx +++ b/src/app/page.tsx @@ -29,8 +29,6 @@ export default function Home() { window.location.reload(); }; - console.log(process.env.NEXT_PUBLIC_URL); - return (
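The refactor above replaces the two ad-hoc metric lists with a TextAnalysis class plus a generic create_sorted_frequency_ranges binner driven by the range_settings block in config. A minimal usage sketch of the new pipeline, assuming analyse.py is importable from the working directory and NLTK's punkt tokenizer data has been downloaded; the sample strings are illustrative only:

import nltk

from analyse import TextAnalysis, create_sorted_frequency_ranges

nltk.download("punkt", quiet=True)  # tokenizer models behind word_tokenize/sent_tokenize

samples = [
    "The quick brown fox jumps over the lazy dog. It was not amused.",
    "Readability metrics such as the Flesch-Kincaid grade estimate how demanding a text is.",
]

# One metrics dict per response, with the same keys that end up in combined_analysis.json:
# average_word_length, average_word_count, lexical_diversity, flesch_kincaid_grade.
analyses = [TextAnalysis(text).analyze() for text in samples]

# Bin a single metric with the same settings main() uses for lexical diversity
# (0.05-wide buckets over [0, 1.05)).
lex_values = [a["lexical_diversity"] for a in analyses]
print(create_sorted_frequency_ranges(lex_values, min_val=0, max_val=1.05, range_size=0.05))

Note that flesch_kincaid_grade clamps its result to the 0-18 range, which is why the "18" bucket in both exported histograms (926 human, 3752 AI) absorbs every text whose raw grade exceeds 18.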
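On the plotting side, graph.py only widens the subplot grid from two panels to four and reuses prepare_data and plot_distribution for the two new metrics. The body of prepare_data is not shown in this diff; given that it receives a bucket-label-to-count mapping from combined_analysis.json and feeds sns.kdeplot, a plausible reconstruction (an assumption, not the repository's verbatim code) is to repeat each bucket's numeric label by its count:

import numpy as np

def prepare_data(stats):
    # Assumed reconstruction: expand {"bucket_label": count} into a flat array of
    # repeated bucket values so the KDE roughly mirrors the exported histogram.
    values = []
    for label, count in stats.items():
        values.extend([float(label)] * count)
    return np.array(values)

Under that reading, adding the average-word-length and Flesch-Kincaid panels needs no new data handling, which matches the diff touching only main() in graph.py.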