From a0f36444a2904b6a42d6190753a81fd737c7ad43 Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Wed, 21 Jun 2023 15:15:56 -0700
Subject: [PATCH 01/15] Test commit for advanced spark setup

---
 src/smspark/bootstrapper.py | 55 +++++++++++++++++++++++++++++--------
 src/smspark/constants.py    |  7 +++++
 2 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/src/smspark/bootstrapper.py b/src/smspark/bootstrapper.py
index db7c758..2e83a96 100644
--- a/src/smspark/bootstrapper.py
+++ b/src/smspark/bootstrapper.py
@@ -388,18 +388,49 @@ def get_yarn_spark_resource_config(
         self, instance_count: int, instance_mem_mb: int, instance_cores: int
     ) -> Tuple[Configuration, Configuration]:
         aws_region = os.getenv("AWS_REGION")
-        executor_cores = instance_cores
-        executor_count_per_instance = int(instance_cores / executor_cores)
-        executor_count_total = instance_count * executor_count_per_instance
-        default_parallelism = instance_count * instance_cores * 2
-
-        driver_mem_mb = int(instance_mem_mb * constants.DRIVER_MEM_INSTANCE_MEM_RATIO)
-        driver_mem_overhead_mb = int(driver_mem_mb * constants.DRIVER_MEM_OVERHEAD_RATIO)
-        executor_mem_mb = int(
-            ((instance_mem_mb * constants.EXECUTOR_MEM_INSTANCE_MEM_RATIO) / executor_count_per_instance)
-            * (1 - constants.EXECUTOR_MEM_OVERHEAD_RATIO)
-        )
-        executor_mem_overhead_mb = int(executor_mem_mb * constants.EXECUTOR_MEM_OVERHEAD_RATIO)
+        spark_config_mode = int(os.getenv("AWS_SPARK_CONFIG_MODE", str(constants.AWS_SPARK_CONFIG_MODE_STANDARD)))
+
+        if spark_config_mode == constants.AWS_SPARK_CONFIG_MODE_STANDARD:
+            executor_cores = instance_cores
+            executor_count_per_instance = int(instance_cores / executor_cores)
+            executor_count_total = instance_count * executor_count_per_instance
+            default_parallelism = instance_count * instance_cores * 2
+
+            driver_mem_mb = int(instance_mem_mb * constants.DRIVER_MEM_INSTANCE_MEM_RATIO)
+            driver_mem_overhead_mb = int(driver_mem_mb * constants.DRIVER_MEM_OVERHEAD_RATIO)
+            executor_mem_mb = int(
+                ((instance_mem_mb * constants.EXECUTOR_MEM_INSTANCE_MEM_RATIO) / executor_count_per_instance)
+                * (1 - constants.EXECUTOR_MEM_OVERHEAD_RATIO)
+            )
+            executor_mem_overhead_mb = int(executor_mem_mb * constants.EXECUTOR_MEM_OVERHEAD_RATIO)
+        elif spark_config_mode == constants.AWS_SPARK_CONFIG_MODE_ADVANCED:
+            # memory reduction (safer choice)
+            reduced_instance_mem_mb = int(instance_mem_mb * constants.SAFE_MEMORY_REDUCTION_RATIO)
+            # executor cores (set to 5 as constant)
+            executor_cores = constants.EXECUTOR_CORES
+            if executor_cores >= instance_cores:
+                executor_cores = instance_cores - 1
+            # executor count per instance, subtract 1 core from the instance cores to save for the Hadoop daemons
+            executor_count_per_instance = int((instance_cores - 1) / executor_cores)
+            # executor instances, leave 1 slot for the driver
+            executor_count_total = (instance_count * executor_count_per_instance) - 1
+            # default parallelism
+            default_parallelism = executor_count_total * executor_cores * 2
+            # total memory for one executor on the instance, leave 1GB for the Hadoop daemons
+            total_executor_memory = int((reduced_instance_mem_mb - constants.HADOOP_DAEMONS_MEM_MB) / executor_count_per_instance)
+            # executor memory MB (90% of the total executor mem)
+            executor_mem_mb = int(total_executor_memory * constants.EXECUTOR_MEM_INSTANCE_MEM_RATIO_ADV)
+            # executor memory overhead MB (10% of the total executor mem)
+            executor_mem_overhead_mb = int(total_executor_memory * constants.EXECUTOR_MEM_OVERHEAD_RATIO)
+            # setting driver memory as the executor memory
+            driver_mem_mb = executor_mem_mb
+            driver_mem_overhead_mb = executor_mem_overhead_mb
+        else:
+            raise ValueError(
+                "Could not determine Spark configuration mode: {}.".format(
+                    spark_config_mode
+                )
+            )
 
         driver_gc_config = (
             "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 "

diff --git a/src/smspark/constants.py b/src/smspark/constants.py
index d2de860..59b7663 100644
--- a/src/smspark/constants.py
+++ b/src/smspark/constants.py
@@ -3,3 +3,10 @@
 DRIVER_MEM_OVERHEAD_RATIO = 0.1
 EXECUTOR_MEM_INSTANCE_MEM_RATIO = 0.95
 EXECUTOR_MEM_OVERHEAD_RATIO = 0.1
+
+EXECUTOR_CORES = 5
+HADOOP_DAEMONS_MEM_MB = 1024
+SAFE_MEMORY_REDUCTION_RATIO = 0.95
+EXECUTOR_MEM_INSTANCE_MEM_RATIO_ADV = 0.90
+AWS_SPARK_CONFIG_MODE_STANDARD = 1
+AWS_SPARK_CONFIG_MODE_ADVANCED = 2
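To sanity-check the advanced-mode arithmetic introduced above, the following standalone Python sketch replays the same formulas with the new constants inlined. The instance shape (2 nodes, 16 cores, 65536 MB each) is a hypothetical example chosen only for illustration; it is not fixed by the patch:

    # Replays the advanced-mode branch of get_yarn_spark_resource_config on
    # hypothetical hardware: 2 instances, 16 cores and 65536 MB of RAM each.
    instance_count, instance_cores, instance_mem_mb = 2, 16, 65536

    reduced_instance_mem_mb = int(instance_mem_mb * 0.95)  # SAFE_MEMORY_REDUCTION_RATIO -> 62259 MB
    executor_cores = 5  # EXECUTOR_CORES; below instance_cores, so the clamp to instance_cores - 1 does not trigger
    executor_count_per_instance = int((instance_cores - 1) / executor_cores)   # (16 - 1) // 5 = 3
    executor_count_total = (instance_count * executor_count_per_instance) - 1  # 2 * 3 - 1 = 5
    default_parallelism = executor_count_total * executor_cores * 2            # 5 * 5 * 2 = 50
    total_executor_memory = int((reduced_instance_mem_mb - 1024) / executor_count_per_instance)  # 20411 MB
    executor_mem_mb = int(total_executor_memory * 0.90)         # 18369 MB of heap per executor
    executor_mem_overhead_mb = int(total_executor_memory * 0.1) # 2041 MB of overhead per executor
    driver_mem_mb, driver_mem_overhead_mb = executor_mem_mb, executor_mem_overhead_mb

One core per node is held back for the Hadoop daemons and one executor slot for the driver, which is why the executor counts come out lower than the raw core budget would suggest.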
From 754ca93d39e3fef7c05a05d1b2a73447fc14d66f Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Wed, 21 Jun 2023 15:15:56 -0700
Subject: [PATCH 02/15] Test commit for advanced spark setup

---
 smsparkbuild/py39/Pipfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smsparkbuild/py39/Pipfile b/smsparkbuild/py39/Pipfile
index f72699c..fdd8f94 100644
--- a/smsparkbuild/py39/Pipfile
+++ b/smsparkbuild/py39/Pipfile
@@ -12,7 +12,7 @@ click = "==8.1.2"
 watchdog = "==0.10.3"
 waitress = "==2.1.2"
 types-waitress = "==2.0.6"
-requests = "==2.27.1"
+requests = "==2.31.0"
 types-requests = "==2.27.16"
 rsa = "==4.9"
 pyasn1 = "==0.4.8"

From f6615b3d7a423a1c257a25069440222c273077fb Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Wed, 21 Jun 2023 15:15:56 -0700
Subject: [PATCH 03/15] Test commit for advanced spark setup

---
 smsparkbuild/py39/Pipfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/smsparkbuild/py39/Pipfile b/smsparkbuild/py39/Pipfile
index fdd8f94..c900f07 100644
--- a/smsparkbuild/py39/Pipfile
+++ b/smsparkbuild/py39/Pipfile
@@ -36,6 +36,8 @@ pytest-parallel = "==0.1.1"
 pytest-rerunfailures = "10.0"
 numpy = "==1.22.2"
 py = "==1.11.0"
+botocore = "==1.29.10"
+PyYAML = "==5.3"
 
 [requires]
 python_version = "3.9"

From c5e844b77459ea65fcbc29a7c76a26d31ce8ed12 Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Mon, 17 Jul 2023 07:54:06 -0700
Subject: [PATCH 04/15] upgrade to new version

---
 smsparkbuild/py39/Pipfile                  | 2 --
 spark/processing/3.3/py3/yum/emr-apps.repo | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/smsparkbuild/py39/Pipfile b/smsparkbuild/py39/Pipfile
index c900f07..fdd8f94 100644
--- a/smsparkbuild/py39/Pipfile
+++ b/smsparkbuild/py39/Pipfile
@@ -36,8 +36,6 @@ pytest-parallel = "==0.1.1"
 pytest-rerunfailures = "10.0"
 numpy = "==1.22.2"
 py = "==1.11.0"
-botocore = "==1.29.10"
-PyYAML = "==5.3"
 
 [requires]
 python_version = "3.9"

diff --git a/spark/processing/3.3/py3/yum/emr-apps.repo b/spark/processing/3.3/py3/yum/emr-apps.repo
index 128fa63..01c04c4 100644
--- a/spark/processing/3.3/py3/yum/emr-apps.repo
+++ b/spark/processing/3.3/py3/yum/emr-apps.repo
@@ -1,8 +1,8 @@
 [emr-apps]
 name = EMR Application Repository
-gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.10.0/44edf8a3-456b-4648-ac62-47f7962064e3/repoPublicKey.txt
+gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.11.0/8be6ded3-04c9-441d-a880-07669e41ea10/repoPublicKey.txt
 enabled = 1
-baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.10.0/44edf8a3-456b-4648-ac62-47f7962064e3
+baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.11.0/8be6ded3-04c9-441d-a880-07669e41ea10
 priority = 5
 gpgcheck = 0
 
From a53f3705648fcce869cfffd21544f38ee0384b47 Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 12:00:41 -0700
Subject: [PATCH 05/15] Using previous EMR release

---
 spark/processing/3.3/py3/yum/emr-apps.repo | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spark/processing/3.3/py3/yum/emr-apps.repo b/spark/processing/3.3/py3/yum/emr-apps.repo
index 01c04c4..128fa63 100644
--- a/spark/processing/3.3/py3/yum/emr-apps.repo
+++ b/spark/processing/3.3/py3/yum/emr-apps.repo
@@ -1,8 +1,8 @@
 [emr-apps]
 name = EMR Application Repository
-gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.11.0/8be6ded3-04c9-441d-a880-07669e41ea10/repoPublicKey.txt
+gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.10.0/44edf8a3-456b-4648-ac62-47f7962064e3/repoPublicKey.txt
 enabled = 1
-baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.11.0/8be6ded3-04c9-441d-a880-07669e41ea10
+baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.10.0/44edf8a3-456b-4648-ac62-47f7962064e3
 priority = 5
 gpgcheck = 0
 

From 98fef80887aba259575fe38119fe487c85314dff Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 12:09:58 -0700
Subject: [PATCH 06/15] Updating image minor version

---
 new_images.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/new_images.yml b/new_images.yml
index d71cc03..3d7f115 100644
--- a/new_images.yml
+++ b/new_images.yml
@@ -4,4 +4,4 @@ new_images:
     use-case: "processing"
     processors: ["cpu"]
     python: ["py39"]
-    sm_version: "1.1"
+    sm_version: "1.2"

From 7a4f7d8567829610f0b7b9a522dabb4832ca55e6 Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 12:30:18 -0700
Subject: [PATCH 07/15] Fix formatting

---
 src/smspark/bootstrapper.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/smspark/bootstrapper.py b/src/smspark/bootstrapper.py
index 2e83a96..ac1c46e 100644
--- a/src/smspark/bootstrapper.py
+++ b/src/smspark/bootstrapper.py
@@ -417,7 +417,9 @@ def get_yarn_spark_resource_config(
             # default parallelism
             default_parallelism = executor_count_total * executor_cores * 2
             # total memory for one executor on the instance, leave 1GB for the Hadoop daemons
-            total_executor_memory = int((reduced_instance_mem_mb - constants.HADOOP_DAEMONS_MEM_MB) / executor_count_per_instance)
+            total_executor_memory = int(
+                (reduced_instance_mem_mb - constants.HADOOP_DAEMONS_MEM_MB) / executor_count_per_instance
+            )
             # executor memory MB (90% of the total executor mem)
             executor_mem_mb = int(total_executor_memory * constants.EXECUTOR_MEM_INSTANCE_MEM_RATIO_ADV)
             # executor memory overhead MB (10% of the total executor mem)
@@ -426,11 +428,7 @@ def get_yarn_spark_resource_config(
             driver_mem_mb = executor_mem_mb
             driver_mem_overhead_mb = executor_mem_overhead_mb
         else:
-            raise ValueError(
-                "Could not determine Spark configuration mode: {}.".format(
-                    spark_config_mode
-                )
-            )
+            raise ValueError("Could not determine Spark configuration mode: {}.".format(spark_config_mode))
 
         driver_gc_config = (
             "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 "
From 1652c5e3a63b3ea5fe6c9f98b3f5256538403883 Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 13:18:44 -0700
Subject: [PATCH 08/15] Revert "Fix formatting"

This reverts commit 7a4f7d8567829610f0b7b9a522dabb4832ca55e6.
---
 src/smspark/bootstrapper.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/smspark/bootstrapper.py b/src/smspark/bootstrapper.py
index ac1c46e..2e83a96 100644
--- a/src/smspark/bootstrapper.py
+++ b/src/smspark/bootstrapper.py
@@ -417,9 +417,7 @@ def get_yarn_spark_resource_config(
             # default parallelism
             default_parallelism = executor_count_total * executor_cores * 2
             # total memory for one executor on the instance, leave 1GB for the Hadoop daemons
-            total_executor_memory = int(
-                (reduced_instance_mem_mb - constants.HADOOP_DAEMONS_MEM_MB) / executor_count_per_instance
-            )
+            total_executor_memory = int((reduced_instance_mem_mb - constants.HADOOP_DAEMONS_MEM_MB) / executor_count_per_instance)
             # executor memory MB (90% of the total executor mem)
             executor_mem_mb = int(total_executor_memory * constants.EXECUTOR_MEM_INSTANCE_MEM_RATIO_ADV)
             # executor memory overhead MB (10% of the total executor mem)
@@ -428,7 +426,11 @@ def get_yarn_spark_resource_config(
             driver_mem_mb = executor_mem_mb
             driver_mem_overhead_mb = executor_mem_overhead_mb
         else:
-            raise ValueError("Could not determine Spark configuration mode: {}.".format(spark_config_mode))
+            raise ValueError(
+                "Could not determine Spark configuration mode: {}.".format(
+                    spark_config_mode
+                )
+            )
 
         driver_gc_config = (
             "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 "

From 7e306bd0979cd76f7fcc738571642d77dbc3faea Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 13:18:55 -0700
Subject: [PATCH 09/15] Revert "Updating image minor version"

This reverts commit 98fef80887aba259575fe38119fe487c85314dff.
---
 new_images.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/new_images.yml b/new_images.yml
index 3d7f115..d71cc03 100644
--- a/new_images.yml
+++ b/new_images.yml
@@ -4,4 +4,4 @@ new_images:
     use-case: "processing"
     processors: ["cpu"]
     python: ["py39"]
-    sm_version: "1.2"
+    sm_version: "1.1"

From e319e5dc8c33d6fddeae0bf1435bc04938d6b91e Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 13:19:06 -0700
Subject: [PATCH 10/15] Revert "Using previous EMR release"

This reverts commit a53f3705648fcce869cfffd21544f38ee0384b47.
---
 spark/processing/3.3/py3/yum/emr-apps.repo | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spark/processing/3.3/py3/yum/emr-apps.repo b/spark/processing/3.3/py3/yum/emr-apps.repo
index 128fa63..01c04c4 100644
--- a/spark/processing/3.3/py3/yum/emr-apps.repo
+++ b/spark/processing/3.3/py3/yum/emr-apps.repo
@@ -1,8 +1,8 @@
 [emr-apps]
 name = EMR Application Repository
-gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.10.0/44edf8a3-456b-4648-ac62-47f7962064e3/repoPublicKey.txt
+gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.11.0/8be6ded3-04c9-441d-a880-07669e41ea10/repoPublicKey.txt
 enabled = 1
-baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.10.0/44edf8a3-456b-4648-ac62-47f7962064e3
+baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.11.0/8be6ded3-04c9-441d-a880-07669e41ea10
 priority = 5
 gpgcheck = 0
 
From dcb0fa84aaaff72a060eaaf6903bcfb9848661d7 Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 13:19:14 -0700
Subject: [PATCH 11/15] Revert "upgrade to new version"

This reverts commit c5e844b77459ea65fcbc29a7c76a26d31ce8ed12.
---
 smsparkbuild/py39/Pipfile                  | 2 ++
 spark/processing/3.3/py3/yum/emr-apps.repo | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/smsparkbuild/py39/Pipfile b/smsparkbuild/py39/Pipfile
index fdd8f94..c900f07 100644
--- a/smsparkbuild/py39/Pipfile
+++ b/smsparkbuild/py39/Pipfile
@@ -36,6 +36,8 @@ pytest-parallel = "==0.1.1"
 pytest-rerunfailures = "10.0"
 numpy = "==1.22.2"
 py = "==1.11.0"
+botocore = "==1.29.10"
+PyYAML = "==5.3"
 
 [requires]
 python_version = "3.9"

diff --git a/spark/processing/3.3/py3/yum/emr-apps.repo b/spark/processing/3.3/py3/yum/emr-apps.repo
index 01c04c4..128fa63 100644
--- a/spark/processing/3.3/py3/yum/emr-apps.repo
+++ b/spark/processing/3.3/py3/yum/emr-apps.repo
@@ -1,8 +1,8 @@
 [emr-apps]
 name = EMR Application Repository
-gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.11.0/8be6ded3-04c9-441d-a880-07669e41ea10/repoPublicKey.txt
+gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.10.0/44edf8a3-456b-4648-ac62-47f7962064e3/repoPublicKey.txt
 enabled = 1
-baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.11.0/8be6ded3-04c9-441d-a880-07669e41ea10
+baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.10.0/44edf8a3-456b-4648-ac62-47f7962064e3
 priority = 5
 gpgcheck = 0
 

From fe44eb52a611f7edeed864181f3b12c001c0e9c9 Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 13:19:25 -0700
Subject: [PATCH 12/15] Revert "Test commit for advanced spark setup"

This reverts commit f6615b3d7a423a1c257a25069440222c273077fb.
---
 smsparkbuild/py39/Pipfile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/smsparkbuild/py39/Pipfile b/smsparkbuild/py39/Pipfile
index c900f07..fdd8f94 100644
--- a/smsparkbuild/py39/Pipfile
+++ b/smsparkbuild/py39/Pipfile
@@ -36,8 +36,6 @@ pytest-parallel = "==0.1.1"
 pytest-rerunfailures = "10.0"
 numpy = "==1.22.2"
 py = "==1.11.0"
-botocore = "==1.29.10"
-PyYAML = "==5.3"
 
 [requires]
 python_version = "3.9"
From 645861e8af5bd2f868ab52543a9d7cc6fcf8bb8e Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 13:19:34 -0700
Subject: [PATCH 13/15] Revert "Test commit for advanced spark setup"

This reverts commit 754ca93d39e3fef7c05a05d1b2a73447fc14d66f.
---
 smsparkbuild/py39/Pipfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smsparkbuild/py39/Pipfile b/smsparkbuild/py39/Pipfile
index fdd8f94..f72699c 100644
--- a/smsparkbuild/py39/Pipfile
+++ b/smsparkbuild/py39/Pipfile
@@ -12,7 +12,7 @@ click = "==8.1.2"
 watchdog = "==0.10.3"
 waitress = "==2.1.2"
 types-waitress = "==2.0.6"
-requests = "==2.31.0"
+requests = "==2.27.1"
 types-requests = "==2.27.16"
 rsa = "==4.9"
 pyasn1 = "==0.4.8"
From 627fdd5be0cf3104a1f1eacf42f1c1526642f931 Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 13:19:43 -0700
Subject: [PATCH 14/15] Revert "Test commit for advanced spark setup"

This reverts commit a0f36444a2904b6a42d6190753a81fd737c7ad43.
---
 src/smspark/bootstrapper.py | 55 ++++++++-----------------------
 src/smspark/constants.py    |  7 -----
 2 files changed, 12 insertions(+), 50 deletions(-)

diff --git a/src/smspark/bootstrapper.py b/src/smspark/bootstrapper.py
index 2e83a96..db7c758 100644
--- a/src/smspark/bootstrapper.py
+++ b/src/smspark/bootstrapper.py
@@ -388,49 +388,18 @@ def get_yarn_spark_resource_config(
         self, instance_count: int, instance_mem_mb: int, instance_cores: int
     ) -> Tuple[Configuration, Configuration]:
         aws_region = os.getenv("AWS_REGION")
-        spark_config_mode = int(os.getenv("AWS_SPARK_CONFIG_MODE", str(constants.AWS_SPARK_CONFIG_MODE_STANDARD)))
-
-        if spark_config_mode == constants.AWS_SPARK_CONFIG_MODE_STANDARD:
-            executor_cores = instance_cores
-            executor_count_per_instance = int(instance_cores / executor_cores)
-            executor_count_total = instance_count * executor_count_per_instance
-            default_parallelism = instance_count * instance_cores * 2
-
-            driver_mem_mb = int(instance_mem_mb * constants.DRIVER_MEM_INSTANCE_MEM_RATIO)
-            driver_mem_overhead_mb = int(driver_mem_mb * constants.DRIVER_MEM_OVERHEAD_RATIO)
-            executor_mem_mb = int(
-                ((instance_mem_mb * constants.EXECUTOR_MEM_INSTANCE_MEM_RATIO) / executor_count_per_instance)
-                * (1 - constants.EXECUTOR_MEM_OVERHEAD_RATIO)
-            )
-            executor_mem_overhead_mb = int(executor_mem_mb * constants.EXECUTOR_MEM_OVERHEAD_RATIO)
-        elif spark_config_mode == constants.AWS_SPARK_CONFIG_MODE_ADVANCED:
-            # memory reduction (safer choice)
-            reduced_instance_mem_mb = int(instance_mem_mb * constants.SAFE_MEMORY_REDUCTION_RATIO)
-            # executor cores (set to 5 as constant)
-            executor_cores = constants.EXECUTOR_CORES
-            if executor_cores >= instance_cores:
-                executor_cores = instance_cores - 1
-            # executor count per instance, subtract 1 core from the instance cores to save for the Hadoop daemons
-            executor_count_per_instance = int((instance_cores - 1) / executor_cores)
-            # executor instances, leave 1 slot for the driver
-            executor_count_total = (instance_count * executor_count_per_instance) - 1
-            # default parallelism
-            default_parallelism = executor_count_total * executor_cores * 2
-            # total memory for one executor on the instance, leave 1GB for the Hadoop daemons
-            total_executor_memory = int((reduced_instance_mem_mb - constants.HADOOP_DAEMONS_MEM_MB) / executor_count_per_instance)
-            # executor memory MB (90% of the total executor mem)
-            executor_mem_mb = int(total_executor_memory * constants.EXECUTOR_MEM_INSTANCE_MEM_RATIO_ADV)
-            # executor memory overhead MB (10% of the total executor mem)
-            executor_mem_overhead_mb = int(total_executor_memory * constants.EXECUTOR_MEM_OVERHEAD_RATIO)
-            # setting driver memory as the executor memory
-            driver_mem_mb = executor_mem_mb
-            driver_mem_overhead_mb = executor_mem_overhead_mb
-        else:
-            raise ValueError(
-                "Could not determine Spark configuration mode: {}.".format(
-                    spark_config_mode
-                )
-            )
+        executor_cores = instance_cores
+        executor_count_per_instance = int(instance_cores / executor_cores)
+        executor_count_total = instance_count * executor_count_per_instance
+        default_parallelism = instance_count * instance_cores * 2
+
+        driver_mem_mb = int(instance_mem_mb * constants.DRIVER_MEM_INSTANCE_MEM_RATIO)
+        driver_mem_overhead_mb = int(driver_mem_mb * constants.DRIVER_MEM_OVERHEAD_RATIO)
+        executor_mem_mb = int(
+            ((instance_mem_mb * constants.EXECUTOR_MEM_INSTANCE_MEM_RATIO) / executor_count_per_instance)
+            * (1 - constants.EXECUTOR_MEM_OVERHEAD_RATIO)
+        )
+        executor_mem_overhead_mb = int(executor_mem_mb * constants.EXECUTOR_MEM_OVERHEAD_RATIO)
 
         driver_gc_config = (
             "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 "

diff --git a/src/smspark/constants.py b/src/smspark/constants.py
index 59b7663..d2de860 100644
--- a/src/smspark/constants.py
+++ b/src/smspark/constants.py
@@ -3,10 +3,3 @@
 DRIVER_MEM_OVERHEAD_RATIO = 0.1
 EXECUTOR_MEM_INSTANCE_MEM_RATIO = 0.95
 EXECUTOR_MEM_OVERHEAD_RATIO = 0.1
-
-EXECUTOR_CORES = 5
-HADOOP_DAEMONS_MEM_MB = 1024
-SAFE_MEMORY_REDUCTION_RATIO = 0.95
-EXECUTOR_MEM_INSTANCE_MEM_RATIO_ADV = 0.90
-AWS_SPARK_CONFIG_MODE_STANDARD = 1
-AWS_SPARK_CONFIG_MODE_ADVANCED = 2
From e6b33bd682cacfaf0a22e8a58a05a2b514d55d5b Mon Sep 17 00:00:00 2001
From: Sumit Awasthi
Date: Fri, 21 Jul 2023 13:22:45 -0700
Subject: [PATCH 15/15] Revert "Revert "Fix formatting""

This reverts commit 1652c5e3a63b3ea5fe6c9f98b3f5256538403883.
---
 src/smspark/bootstrapper.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/smspark/bootstrapper.py b/src/smspark/bootstrapper.py
index 2e83a96..ac1c46e 100644
--- a/src/smspark/bootstrapper.py
+++ b/src/smspark/bootstrapper.py
@@ -417,7 +417,9 @@ def get_yarn_spark_resource_config(
             # default parallelism
             default_parallelism = executor_count_total * executor_cores * 2
             # total memory for one executor on the instance, leave 1GB for the Hadoop daemons
-            total_executor_memory = int((reduced_instance_mem_mb - constants.HADOOP_DAEMONS_MEM_MB) / executor_count_per_instance)
+            total_executor_memory = int(
+                (reduced_instance_mem_mb - constants.HADOOP_DAEMONS_MEM_MB) / executor_count_per_instance
+            )
             # executor memory MB (90% of the total executor mem)
             executor_mem_mb = int(total_executor_memory * constants.EXECUTOR_MEM_INSTANCE_MEM_RATIO_ADV)
             # executor memory overhead MB (10% of the total executor mem)
@@ -426,11 +428,7 @@ def get_yarn_spark_resource_config(
             driver_mem_mb = executor_mem_mb
             driver_mem_overhead_mb = executor_mem_overhead_mb
         else:
-            raise ValueError(
-                "Could not determine Spark configuration mode: {}.".format(
-                    spark_config_mode
-                )
-            )
+            raise ValueError("Could not determine Spark configuration mode: {}.".format(spark_config_mode))
 
         driver_gc_config = (
             "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 "
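The bootstrapper selects between the two allocation paths at container start through the AWS_SPARK_CONFIG_MODE environment variable (1 = standard, 2 = advanced), defaulting to standard when the variable is unset. A minimal sketch of how a processing job could opt in, assuming the usual sagemaker SDK surface; the job name, role ARN, instance shape, and S3 path below are illustrative placeholders, not values taken from these patches:

    # Minimal sketch of opting in to the advanced mode from a SageMaker
    # Processing job. All concrete names and ARNs here are placeholders.
    from sagemaker.spark.processing import PySparkProcessor

    processor = PySparkProcessor(
        base_job_name="sm-spark-advanced",
        framework_version="3.3",
        role="arn:aws:iam::111122223333:role/ExampleSageMakerRole",
        instance_count=2,
        instance_type="ml.m5.4xlarge",
        env={"AWS_SPARK_CONFIG_MODE": "2"},  # 2 == constants.AWS_SPARK_CONFIG_MODE_ADVANCED
    )
    processor.run(submit_app="s3://example-bucket/code/preprocess.py")

Because the variable defaults to the standard mode, existing jobs that never set it keep the original one-executor-per-instance resource layout.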