From 8a57df9386c96cd41f657f304e37887023a72880 Mon Sep 17 00:00:00 2001
From: Jonathan Shi <jonathan.shi@snowflake.com>
Date: Thu, 29 Aug 2024 13:15:57 -0700
Subject: [PATCH] fix as_index and doctests

---
 .../plugin/compiler/snowflake_query_compiler.py   | 15 ++++++++-------
 .../snowpark/modin/plugin/docstrings/groupby.py   | 12 ++++++------
 tests/integ/modin/groupby/test_value_counts.py    | 15 +++++++++++++++
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index 66ed9bf232a..7e03986036c 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -5040,10 +5040,7 @@ def groupby_value_counts(
             # unlike pandas, we do not include the offending labels in the error message.
             raise ValueError("Keys in subset cannot be in the groupby column keys")
         if subset is not None:
-            if not isinstance(subset, (list, tuple)):
-                subset_list = [subset]
-            else:
-                subset_list = subset
+            subset_list = subset
         else:
             # If subset is unspecified, then all columns should be included.
             subset_list = self._modin_frame.data_column_pandas_labels
@@ -5150,10 +5147,14 @@ def groupby_value_counts(
             # the count/proportion column. The left-most column (nearest to the grouping columns
             # is sorted on last).
             # Exclude the grouping columns (always the first) from the sort.
+            if as_index:
+                # When as_index is true, the non-grouping columns are part of the index columns
+                columns_to_filter = result._modin_frame.index_column_pandas_labels
+            else:
+                # When as_index is false, the non-grouping columns are part of the data columns
+                columns_to_filter = result._modin_frame.data_column_pandas_labels
             non_grouping_cols = [
-                col_label
-                for col_label in result._modin_frame.index_column_pandas_labels
-                if col_label not in by
+                col_label for col_label in columns_to_filter if col_label not in by
             ]
             sort_cols.extend(non_grouping_cols)
             ascending_cols.extend([True] * len(non_grouping_cols))
diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
index 70d8d568493..46fab52f85a 100644
--- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
+++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
@@ -257,8 +257,8 @@ def value_counts():
 
         >>> df.groupby('gender').value_counts()  # doctest: +NORMALIZE_WHITESPACE
         gender  education  country
-        female  high       US         1
-                           FR         1
+        female  high       FR         1
+                           US         1
         male    low        FR         2
                            US         1
                 medium     FR         1
@@ -266,8 +266,8 @@ def value_counts():
 
         >>> df.groupby('gender').value_counts(ascending=True)  # doctest: +NORMALIZE_WHITESPACE
         gender  education  country
-        female  high       US         1
-                           FR         1
+        female  high       FR         1
+                           US         1
         male    low        US         1
                 medium     FR         1
                 low        FR         2
@@ -275,8 +275,8 @@ def value_counts():
 
         >>> df.groupby('gender').value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
         gender  education  country
-        female  high       US         0.50
-                           FR         0.50
+        female  high       FR         0.50
+                           US         0.50
         male    low        FR         0.50
                            US         0.25
                 medium     FR         0.25
diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py
index ae4a14ba88b..1f1b2f5c052 100644
--- a/tests/integ/modin/groupby/test_value_counts.py
+++ b/tests/integ/modin/groupby/test_value_counts.py
@@ -131,6 +131,21 @@ def test_value_counts_normalize(
     )
 
 
+@pytest.mark.parametrize("test_data", TEST_DATA)
+@pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]])
+@pytest.mark.parametrize("groupby_sort", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("as_index", [True, False])
+@sql_count_checker(query_count=1)
+def test_value_counts_as_index(test_data, by, groupby_sort, sort, as_index):
+    eval_snowpark_pandas_result(
+        *create_test_dfs(test_data),
+        lambda df: df.groupby(by=by, sort=groupby_sort, as_index=as_index).value_counts(
+            sort=sort
+        ),
+    )
+
+
 @pytest.mark.parametrize(
     "subset, exception_cls",
     [