From e127138e4d3600ad994858f32cb58aac90d82a88 Mon Sep 17 00:00:00 2001
From: Glenn Rice <grice1@missouriwestern.edu>
Date: Sun, 20 Aug 2023 10:23:32 -0500
Subject: [PATCH] Rewrite the dump_past_answers script (renamed to
 dump-past-answers.pl).

The script no longer saves ids (course id, set id, user id, etc.) as
useless hashes.  I don't know why it was thought that doing so was a
good idea.

The script does not gzip the resulting csv file.  You can do that on
your own if so desired.

The script does not have an upload option.  You can upload it to where
you want on your own if so desired.

The unix timestamp for the time at which the script is executed is no
longer saved in each row of the CSV file.  It is part of the default
file name if you want it.

Past answers from gateway quiz versions are also dumped.  I think it was
intended that they would be dumped before, but it wasn't working.

Past answers from problems with more than one answer in them are now
dumped.  It certainly was intended that this be done before, but wasn't
because of a bug in the script.
---
 bin/dump-past-answers.pl | 299 +++++++++++++++++++++++++++++++++++
 bin/dump_past_answers    | 329 ---------------------------------------
 2 files changed, 299 insertions(+), 329 deletions(-)
 create mode 100755 bin/dump-past-answers.pl
 delete mode 100755 bin/dump_past_answers
diff --git a/bin/dump-past-answers.pl b/bin/dump-past-answers.pl
new file mode 100755
index 0000000000..d4dc3082a7
--- /dev/null
+++ b/bin/dump-past-answers.pl
@@ -0,0 +1,299 @@
+#!/usr/bin/env perl
+################################################################################
+# WeBWorK Online Homework Delivery System
+# Copyright &copy; 2000-2023 The WeBWorK Project, https://github.com/openwebwork
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of either: (a) the GNU General Public License as published by the
+# Free Software Foundation; either version 2, or (at your option) any later
+# version, or (b) the "Artistic License" which comes with this package.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See either the GNU General Public License or the
+# Artistic License for more details.
+################################################################################
+
+=head1 NAME
+
+dump-past-answers.pl: This script dumps past answers from courses into a CSV
+file.
+
+=head1 SYNOPSIS
+
+dump-past-answers.pl [options]
+
+    Options:
+        -c|--course          Course from which to dump past answers
+        -f|--output-file     CSV file name to dump past answers to
+        -h|--help            Show this help
+
+The C<course> option can be repeated multiple times to dump past answers from
+multiple courses into the same file.  If no courses are given via this option,
+then past answers from all courses will be dumped.
+
+If the C<output-file> option is not given then
+C<past-answers-$current_unix_time.csv> will be used for the output file name.
+
+=head1 DESCRIPTION
+
+The CSV file that is generated has the following columns:
+
+ID info
+
+    0 - Answer ID
+    1 - Course ID
+    2 - Student ID
+    3 - Set ID
+    4 - Problem ID
+
+User Info
+
+    5 - Permission Level
+    6 - User Course Status
+
+Set Info
+
+    7 - Set type
+    8 - Open Date (unix time)
+    9 - Due Date (unix time)
+    10 - Answer Date (unix time)
+    11 - Final Set Grade (earned problem value divided by total problem value)
+
+Problem Info
+
+    12 - Problem Path
+    13 - Problem Value
+    14 - Problem Max Attempts
+    15 - Problem Seed
+    16 - Attempted
+    17 - Final Incorrect Attempts
+    18 - Final Correct Attempts
+    19 - Final Status
+
+OPL Info
+
+    20 - Subject
+    21 - Chapter
+    22 - Section
+    23 - Keywords
+
+Answer Info
+
+    24 - Answer timestamp (unix time)
+    25 - Attempt Number
+    26 - Raw status of attempt (fraction of correct blanks, from 0 to 1)
+    27 - Number of Answer Blanks
+    28/29 etc... - The following columns will come in pairs.  The first will be
+                   the text of the answer contained in the answer blank
+                   and the second will be the binary 0/1 status of the answer
+                   blank.  There will be as many pairs as answer blanks.
+
+=cut
+
+use strict;
+use warnings;
+use feature 'say';
+
+BEGIN {
+	use Mojo::File qw(curfile);
+	use Env qw(WEBWORK_ROOT);
+	$WEBWORK_ROOT = curfile->dirname->dirname;
+}
+
+use lib "$ENV{WEBWORK_ROOT}/lib";
+
+use Getopt::Long qw(:config bundling);
+use Pod::Usage;
+use Text::CSV;
+
+use WeBWorK::CourseEnvironment;
+use WeBWorK::DB;
+use WeBWorK::Utils::CourseManagement qw(listCourses);
+use WeBWorK::Utils::Tags;
+
+# Get options.  Usage is shown and the script exits if an invalid option is given.
+my @courses;
+my $output_file = "past-answers-" . time . ".csv";
+my $show_help;
+GetOptions('c|course=s' => \@courses, 'f|output-file=s' => \$output_file, 'h|help' => \$show_help)
+	or pod2usage(2);
+pod2usage(2) if $show_help;
+
+@courses = listCourses(WeBWorK::CourseEnvironment->new({ webwork_dir => $ENV{WEBWORK_ROOT} })) unless @courses;
+
+# Get the OPL tag columns (subject, chapter, section, keywords) for the problem file $source_file, which is
+# relative to the templates directory of the course environment $ce.  Tag objects are cached in the hash
+# reference $cache by full file path.  Empty strings are returned if no tag data can be found, so that the
+# tag data of a previously dumped problem is never carried over into the current row.
+sub get_opl_tag_columns {
+	my ($ce, $cache, $source_file) = @_;
+	return ('', '', '', '') unless $source_file;
+	my $file = "$ce->{courseDirs}{templates}/$source_file";
+	$cache->{$file} = WeBWorK::Utils::Tags->new($file) if !defined $cache->{$file} && -e $file;
+	my $tags = $cache->{$file};
+	return ('', '', '', '') unless defined $tags;
+	return (
+		$tags->{DBsubject}, $tags->{DBchapter}, $tags->{DBsection},
+		defined $tags->{keywords} ? join(',', @{ $tags->{keywords} }) : ''
+	);
+}
+
+# Write the past answers of all courses in @courses (except the admin and modelCourse courses) to the file
+# handle $outFH in CSV format, one row per past answer.  The columns are described in the DESCRIPTION above.
+sub write_past_answers_csv {
+	my $outFH = shift;
+
+	my $csv = Text::CSV->new({ binary => 1, eol => "\n" }) or die "Cannot use CSV: " . Text::CSV->error_diag();
+
+	# Cache OPL tag data when it is looked up instead of looking up each file every time it appears as the source file
+	# for a past answer.  This considerably speeds up this script.
+	my %OPL_tag_data;
+
+	for my $courseID (@courses) {
+		next if $courseID eq 'admin' || $courseID eq 'modelCourse';
+
+		# Only use the course environment and database once it is known that they were actually created.
+		my $ce = WeBWorK::CourseEnvironment->new({ webwork_dir => $ENV{WEBWORK_ROOT}, courseName => $courseID });
+		my $db = defined $ce ? WeBWorK::DB->new($ce->{dbLayout}) : undef;
+		unless (defined $ce && defined $db) {
+			warn("Unable to load course environment and database for $courseID");
+			next;
+		}
+
+		my %permissionLabels = reverse %{ $ce->{userRoles} };
+
+		say "Dumping past answers for $courseID";
+
+		# Get all past answers for this course sorted by answer_id and organize them by user, set, and problem.
+		my %pastAnswers;
+		for ($db->getPastAnswersWhere({}, 'answer_id')) {
+			push(@{ $pastAnswers{ $_->user_id }{ $_->set_id }{ $_->problem_id } }, $_);
+		}
+
+		# This row is reused for every past answer of the course, so stale columns must be overwritten or removed.
+		my @row;
+		$row[1] = $courseID;
+
+		# Users whose user id starts with "set_id:" are not dumped.
+		my @users = $db->getUsersWhere({ user_id => { not_like => 'set_id:%' } });
+
+		for my $user (@users) {
+			my $userID = $user->user_id;
+
+			$row[2] = $userID;
+			$row[5] = $permissionLabels{ $db->getPermissionLevel($userID)->permission };
+			$row[6] = $ce->status_abbrev_to_name($user->{status});
+
+			# For gateway quizzes each version of the set is dumped instead of the set itself.
+			my @sets;
+			for ($db->getMergedSetsWhere({ user_id => $userID }, 'set_id')) {
+				if (defined $_->assignment_type && $_->assignment_type =~ /gateway/) {
+					my $setID    = $_->set_id;
+					my @versions = $db->listSetVersions($userID, $setID);
+					for my $version (@versions) {
+						push(@sets, $db->getUserSet($userID, "$setID,v$version"));
+					}
+				} else {
+					push(@sets, $_);
+				}
+			}
+
+			for my $set (@sets) {
+				my $setID = $set->set_id;
+
+				$row[3]  = $setID;
+				$row[7]  = $set->assignment_type;
+				$row[8]  = $set->open_date;
+				$row[9]  = $set->due_date;
+				$row[10] = $set->answer_date;
+
+				# The assignment type may be undefined, in which case the set is not treated as a gateway quiz.
+				my @problems =
+					($set->assignment_type // '') =~ /gateway/
+					? $db->getMergedProblemVersionsWhere({ user_id => $userID, set_id => $setID }, 'problem_id')
+					: $db->getMergedProblemsWhere({ user_id => $userID, set_id => $setID }, 'problem_id');
+
+				# Compute set score
+				my $total   = 0;
+				my $correct = 0;
+				for my $problem (@problems) {
+					$total   += $problem->value;
+					$correct += $problem->value * $problem->status;
+				}
+				$row[11] = $total ? $correct / $total : 0;
+
+				for my $problem (@problems) {
+					my $problemID = $problem->problem_id;
+
+					$row[4]  = $problemID;
+					$row[12] = $problem->source_file;
+					$row[13] = $problem->value;
+					$row[14] = $problem->max_attempts;
+					$row[15] = $problem->problem_seed;
+					$row[16] = $problem->attempted;
+					$row[17] = $problem->num_incorrect;
+					$row[18] = $problem->num_correct;
+					$row[19] = $problem->status;
+
+					# Get OPL tag data.
+					@row[ 20 .. 23 ] = get_opl_tag_columns($ce, \%OPL_tag_data, $row[12]);
+
+					my $attempt_number = 0;
+					for my $answer (@{ $pastAnswers{$userID}{$setID}{$problemID} }) {
+						my $answerID = $answer->answer_id;
+						++$attempt_number;
+
+						# If the source file for this answer is different from that of the merged user set,
+						# then update the row and get the OPL tag data for this file.
+						if (($row[12] // '') ne ($answer->source_file // '')) {
+							$row[12] = $answer->source_file;
+							@row[ 20 .. 23 ] = get_opl_tag_columns($ce, \%OPL_tag_data, $row[12]);
+						}
+
+						# Input answer specific info.  Use the seed recorded with the answer, or fall back to the
+						# problem seed so that the seed of a previous answer is never reported.
+						my $answer_seed = $answer->problem_seed;
+						$row[0]  = $answerID;
+						$row[15] = defined $answer_seed && $answer_seed ne '' ? $answer_seed : $problem->problem_seed;
+						$row[24] = $answer->timestamp;
+						$row[25] = $attempt_number;
+
+						my @scores  = split('',   $answer->scores);
+						my @answers = split("\t", $answer->answer_string, -1);
+
+						# Skip answer processing if the number of scores isn't the same as the number of answers.
+						next if $#scores != $#answers;
+
+						my $num_blanks = scalar(@scores);
+
+						# Compute the raw status
+						my $score = 0;
+						for (@scores) { $score += $_ }
+						$row[26] = $num_blanks ? $score / $num_blanks : 0;
+
+						$row[27] = $num_blanks;
+
+						# Truncate the row to the fixed columns so that answer/score pairs from a previous
+						# answer with more answer blanks do not leak into this row.
+						$#row = 27;
+						for (my $i = 0; $i < $num_blanks; $i++) {
+							$row[ 28 + 2 * $i ] = $answers[$i];
+							$row[ 29 + 2 * $i ] = $scores[$i];
+						}
+
+						$csv->print($outFH, \@row) or warn "Couldn't print row";
+					}
+				}
+			}
+		}
+	}
+
+	return;
+}
+
+say "Dumping answer data to $output_file";
+open(my $outFH, '>:encoding(UTF-8)', $output_file) or die("Couldn't open file $output_file: $!");
+write_past_answers_csv($outFH);    # Dumps the past answers of every course in @courses.
+close($outFH) or die("Couldn't close $output_file: $!");
+say 'Done dumping data';
diff --git a/bin/dump_past_answers b/bin/dump_past_answers
deleted file mode 100755
index 1228da7ff8..0000000000
--- a/bin/dump_past_answers
+++ /dev/null
@@ -1,329 +0,0 @@
-#!/usr/bin/env perl
-
-################################################################################
-# WeBWorK Online Homework Delivery System
-# Copyright &copy; 2000-2023 The WeBWorK Project, https://github.com/openwebwork
-#
-# This program is free software; you can redistribute it and/or modify it under
-# the terms of either: (a) the GNU General Public License as published by the
-# Free Software Foundation; either version 2, or (at your option) any later
-# version, or (b) the "Artistic License" which comes with this package.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE.  See either the GNU General Public License or the
-# Artistic License for more details.
-################################################################################
-
-# This script dumps the course information from all unarchived courses into
-# a single csv file.  The csv file has the following columns.
-#
-# ID Info
-# 0 - Answer ID hash
-# 1 - Course ID hash
-# 2 - Student ID hash
-# 3 - Set ID hash
-# 4 - Problem ID hash
-# 5 - Timestamp
-# User Info
-# 6 - Permission Level
-# 7 - Final Status
-# Set Info
-# 8 - Set type
-# 9 - Open Date (unix time)
-# 10 - Due Date (unix time)
-# 11 - Answer Date (unix time)
-# 12 - Final Set Grade (percentage)
-# Problem Info
-# 13 - Problem Path
-# 14 - Problem Value
-# 15 - Problem Max Attempts
-# 16 - Problem Seed
-# 17 - Attempted
-# 18 - Final Incorrect Attempts
-# 19 - Final Correct Attempts
-# 20 - Final Status
-# OPL Info
-# 21 - Subject
-# 22 - Chapter
-# 23 - Section
-# 24 - Keywords
-# Answer Info
-# 25 - Answer timestamp (unix time)
-# 26 - Attempt Number
-# 27 - Raw status of attempt (percentage of correct blanks)
-# 28 - Number of Answer Blanks
-# 29/30 etc... - The following columns will come in pairs.  The first will be
-#                the text of the answer contained in the answer blank
-#                and the second will be the binary 0/1 status of the answer
-#                blank.  There will be as many pairs as answer blanks.
-
-use strict;
-
-BEGIN {
-	use Mojo::File qw(curfile);
-	use Env qw(WEBWORK_ROOT);
-
-	$WEBWORK_ROOT = curfile->dirname->dirname;
-}
-
-use lib "$ENV{WEBWORK_ROOT}/lib";
-
-use WeBWorK::CourseEnvironment;
-
-use WeBWorK::DB;
-use WeBWorK::Utils::CourseIntegrityCheck;
-use WeBWorK::Utils::CourseManagement qw/listCourses/;
-use WeBWorK::Utils::Tags;
-use WeBWorK::PG;
-
-use Text::CSV;
-use Digest::SHA qw(sha256_hex);
-use Net::Domain;
-
-# Deal with options
-my $output_file;
-my $zip_result    = 1;
-my $upload_result = 0;
-
-my $domainname = Net::Domain::domainname;
-my $time       = time();
-
-# define and open the output file.
-if (!$output_file) {
-	$output_file = "$domainname-$time.csv";
-}
-
-my $salt;
-my $SALTFILE;
-my $saltfilename = $ENV{WEBWORK_ROOT} . '/.dump_past_answers_salt';
-
-if (-e $saltfilename) {
-	open($SALTFILE, '<', $saltfilename) || die("Couldn't open salt file.");
-	$salt = <$SALTFILE>;
-	close $SALTFILE;
-} else {
-	$salt = '';
-	for (my $i = 0; $i < 32; $i++) {
-		$salt .= ('.', '/', '0' .. '9', 'A' .. 'Z', 'a' .. 'z')[ rand 64 ];
-	}
-
-	open($SALTFILE, '>', $saltfilename) || die("Couldn't open salt file.");
-	print $SALTFILE $salt;
-	close $SALTFILE;
-}
-
-my $OUT;
-open($OUT, '>', $output_file) || die("Couldn't open file $output_file");
-
-print "Dumping answer data to $output_file\n";
-
-# set up various variables and utilities that we will need
-my ($db, @wheres);
-my $max_answer_blanks = 0;
-my $csv               = new Text::CSV->new({ binary => 1 })
-	or die "Cannot use CSV: " . Text::CSV->error_diag();
-$csv->eol("\n");
-
-my $ce = WeBWorK::CourseEnvironment->new({
-	webwork_dir => $ENV{WEBWORK_ROOT},
-});
-
-my @courses          = listCourses($ce);
-my %permissionLabels = reverse %{ $ce->{userRoles} };
-
-# this is our row array and is the main structure
-my @row;
-
-# go through courses
-foreach my $courseID (@courses) {
-	next if $courseID eq 'admin' || $courseID eq 'modelCourse';
-
-	$ce = WeBWorK::CourseEnvironment->new({
-		webwork_dir => $ENV{WEBWORK_ROOT},
-		courseName  => $courseID,
-	});
-	$db = new WeBWorK::DB($ce->{dbLayout});
-
-	unless (defined($ce) && defined($db)) {
-		warn("Unable to load up database for $courseID");
-		next;
-	}
-
-	print "Dumping $courseID\n";
-
-	my $templateDir = $ce->{courseDirs}->{templates};
-
-	my $sCourseID = sha256_hex($salt . $domainname . $courseID);
-
-	$row[1] = $sCourseID;
-	$row[5] = $time;
-
-	my @userIDs = $db->listUsers();
-	my @users   = $db->getUsers(@userIDs);
-
-	# go through users
-	foreach my $user (@users) {
-		my $userID = $user->user_id;
-
-		#skip proctor users
-		next if $user->user_id =~ /^set_id:/;
-
-		my $sUserID = sha256_hex($salt . $domainname . $courseID . $userID);
-
-		# get user specific info
-		$row[2] = $sUserID;
-		my $permissionLevel = $db->getPermissionLevel($userID);
-		$row[6] = $permissionLabels{ $permissionLevel->permission };
-		$row[7] = $ce->status_abbrev_to_name($user->{status});
-
-		my @setIDs = $db->listUserSets($userID);
-		@wheres = map { [ $userID, $_ ] } @setIDs;
-		my @sets = $db->getMergedSets(@wheres);
-
-		# go through sets
-		foreach my $set (@sets) {
-			# skip gateways
-			if ($set->assignment_type =~ /gateway/
-				&& $set->set_id !~ /,v\d+$/)
-			{
-				next;
-			}
-
-			my $setID  = $set->set_id;
-			my $sSetID = sha256_hex($salt . $domainname . $courseID . $setID);
-
-			# get set specific info
-			$row[3]  = $sSetID;
-			$row[8]  = $set->assignment_type;
-			$row[9]  = $set->open_date;
-			$row[10] = $set->due_date;
-			$row[11] = $set->answer_date;
-
-			my @problemIDs = $db->listUserProblems($userID, $setID);
-			@wheres = map { [ $userID, $setID, $_ ] } @problemIDs;
-			my @problems = $db->getMergedProblems(@wheres);
-
-			# compute set score
-			my $total   = 0;
-			my $correct = 0;
-			foreach my $problem (@problems) {
-				$total   += $problem->value();
-				$correct += $problem->value * $problem->status;
-			}
-			$row[12] = $total ? $correct / $total : 0;
-
-			# go through each problem
-			foreach my $problem (@problems) {
-				my $problemID  = $problem->problem_id;
-				my $sProblemID = sha256_hex($salt . $domainname . $courseID . $userID . $setID . $problemID);
-
-				# print problem specific info
-				$row[4]  = $sProblemID;
-				$row[13] = $problem->source_file;
-				$row[14] = $problem->value;
-				$row[15] = $problem->max_attempts;
-				$row[16] = $problem->problem_seed;
-				$row[17] = $problem->attempted;
-				$row[18] = $problem->num_incorrect;
-				$row[19] = $problem->num_correct;
-				$row[20] = $problem->status;
-
-				# get OPL data
-				my $file = $templateDir . '/' . $problem->source_file();
-				if (-e $file) {
-					my $tags = WeBWorK::Utils::Tags->new($file);
-					$row[21] = $tags->{DBsubject};
-					$row[22] = $tags->{DBchapter};
-					$row[23] = $tags->{DBsection};
-					$row[24] = defined($tags->{keywords}) ? join(',', @{ $tags->{keywords} }) : '';
-				}
-
-				my @answerIDs = $db->listProblemPastAnswers($userID, $setID, $problemID);
-				my @answers   = $db->getPastAnswers(\@answerIDs);
-
-				# go through attempts
-				my $attempt_number = 0;
-				foreach my $answer (@answers) {
-					#reset the row length because it can change;
-					@row = splice(@row, 0, 28);
-					my $answerID = $answer->answer_id;
-					my $sAnswerID =
-						sha256_hex($salt . $domainname . $courseID . $userID . $setID . $problemID . $answerID);
-					$attempt_number++;
-
-					# if the source file changed redo that info
-					if ($row[13] != $answer->source_file) {
-						$row[13] = $answer->source_file;
-						$file = $templateDir . '/' . $answer->source_file();
-						if (-e $file) {
-							my $tags = WeBWorK::Utils::Tags->new($file);
-							$row[21] = $tags->{DBsubject};
-							$row[22] = $tags->{DBchapter};
-							$row[23] = $tags->{DBsection};
-							$row[24] = defined($tags->{keywords}) ? join(',', @{ $tags->{keywords} }) : '';
-						}
-					}
-
-					# input answer specific info
-					$row[0]  = $sAnswerID;
-					$row[25] = $answer->timestamp;
-					$row[26] = $attempt_number;
-
-					my @scores  = split('',   $answer->scores,        -1);
-					my @answers = split("\t", $answer->answer_string, -1);
-
-					# if the number of scores isn't the same as the number of
-					# answers we should skip
-					if ($#scores != $#answers) {
-						next;
-					}
-					my $num_blanks = scalar(@scores);
-
-					$max_answer_blanks = $num_blanks
-						if ($num_blanks > $max_answer_blanks);
-
-					# compute the raw status
-					my $score = 0;
-					foreach (@scores) {
-						$score += $_;
-					}
-
-					$row[27] = $num_blanks ? $score / $num_blanks : 0;
-
-					# we leave the computed status blank for now.
-
-					$row[28] = $num_blanks;
-
-					for (my $i = 0; $i < $num_blanks; $i++) {
-						$row[ 29 + 2 * $i ] = $answers[$i];
-						$row[ 30 + 2 * $i ] = $scores[$i];
-					}
-
-					#form the csv string and print
-					$csv->print($OUT, \@row) || warn "Couldn't print row";
-				}
-			}
-		}
-	}
-}
-
-print "Done dumping data\n";
-
-close($OUT) or die("Couldn't close $output_file");
-
-if ($zip_result) {
-	print "Zipping file\n";
-
-	`gzip $output_file`;
-
-	$output_file = $output_file . ".gz";
-}
-
-if ($upload_result) {
-	print "Uploading file\n";
-
-	`echo "put $output_file" | sftp -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -oPort=57281 wwdata\@52.88.32.79`;
-}
-
-1;