Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add batch size control for column statistics analysis #135

Merged
merged 1 commit into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions src/Dax.Model.Extractor/StatExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ namespace Dax.Model.Extractor
{
public class StatExtractor
{
/// <summary>
/// The default number of rows processed at a time during column statistics analysis.
/// </summary>
public const int DefaultColumnBatchSize = 50;

protected Dax.Metadata.Model DaxModel { get; private set; }
protected IDbConnection Connection { get; private set; }
protected int CommandTimeout { get; private set; } = 0;
Expand All @@ -26,7 +31,7 @@ protected IDbCommand CreateCommand(string commandText)

// UpdateStatisticsModel has been marked as obsolete because its usage may require rerunning the DMVs for models with DirectLake partitions. Since this logic should be handled by the library, we may consider removing it from the public APIs in a future release.
[Obsolete("This method may produce incomplete results if used on a model with DirectLake partitions and DirectLakeExtractionMode parameter set to anything other than ResidentOnly. Use TomExtractor.GetDaxModel instead.")]
public static void UpdateStatisticsModel(Dax.Metadata.Model daxModel, IDbConnection connection, int sampleRows = 0, bool analyzeDirectQuery = false , DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly)
public static void UpdateStatisticsModel(Dax.Metadata.Model daxModel, IDbConnection connection, int sampleRows = 0, bool analyzeDirectQuery = false , DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly, int columnBatchSize = DefaultColumnBatchSize)
{
// TODO: Remove after refactoring the code to use ExtractorSettings: ExtractorProperties as a parameter
daxModel.ExtractorProperties.StatisticsEnabled = true;
Expand All @@ -36,7 +41,7 @@ public static void UpdateStatisticsModel(Dax.Metadata.Model daxModel, IDbConnect

StatExtractor extractor = new StatExtractor(daxModel, connection);
extractor.LoadTableStatistics(analyzeDirectQuery, analyzeDirectLake);
extractor.LoadColumnStatistics(analyzeDirectQuery, analyzeDirectLake);
extractor.LoadColumnStatistics(analyzeDirectQuery, analyzeDirectLake, columnBatchSize);
extractor.LoadRelationshipStatistics(sampleRows, analyzeDirectQuery, analyzeDirectLake);

// Update ExtractionDate
Expand Down Expand Up @@ -242,7 +247,7 @@ private static string EmbedNameInString(string originalName)
{
return originalName.Replace("\"", "\"\"");
}
private void LoadColumnStatistics(bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly)
private void LoadColumnStatistics(bool analyzeDirectQuery, DirectLakeExtractionMode analyzeDirectLake, int columnBatchSize)
{
var allColumns =
(from t in DaxModel.Tables
Expand All @@ -257,7 +262,7 @@ from c in t.Columns
|| (analyzeDirectLake == DirectLakeExtractionMode.Full)
)
select c).ToList();
var loopColumns = allColumns.SplitList(50); // no more than 9999
var loopColumns = allColumns.SplitList(columnBatchSize); // no more than 9999
foreach ( var columnSet in loopColumns ) {
var idString = 0;
var dax = "EVALUATE ";
Expand Down
8 changes: 4 additions & 4 deletions src/Dax.Model.Extractor/TomExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ public static Dax.Metadata.Model GetDaxModel(Tom.Model model, string extractorAp
return extractor.DaxModel;
}

public static Dax.Metadata.Model GetDaxModel(string connectionString, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly)
public static Dax.Metadata.Model GetDaxModel(string connectionString, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly, int statsColumnBatchSize = StatExtractor.DefaultColumnBatchSize)
{
Tom.Server server = new Tom.Server();
server.Connect(connectionString);
Expand All @@ -304,7 +304,7 @@ public static Dax.Metadata.Model GetDaxModel(string connectionString, string app
if (readStatisticsFromData)
{
#pragma warning disable CS0618 // Type or member is obsolete
StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake);
StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake, statsColumnBatchSize);
#pragma warning restore CS0618 // Type or member is obsolete

// If model has any DL partitions and we have forced all columns into memory then re-run the DMVs to update the data with the new values after everything has been transcoded.
Expand Down Expand Up @@ -334,7 +334,7 @@ public static Tom.Database GetDatabase(string connectionString)
return db ?? throw new ArgumentException($"The database '{databaseName}' could not be found. Either it does not exist or you do not have admin rights to it.");
}

public static Dax.Metadata.Model GetDaxModel(string serverName, string databaseName, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly)
public static Dax.Metadata.Model GetDaxModel(string serverName, string databaseName, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly, int statsColumnBatchSize = StatExtractor.DefaultColumnBatchSize)
{
Tom.Database db = GetDatabase(serverName, databaseName);
Tom.Model tomModel = db.Model;
Expand All @@ -352,7 +352,7 @@ public static Dax.Metadata.Model GetDaxModel(string serverName, string databaseN
if (readStatisticsFromData)
{
#pragma warning disable CS0618 // Type or member is obsolete
StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake);
StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake, statsColumnBatchSize);
#pragma warning restore CS0618 // Type or member is obsolete

// If model has any DL partitions and we have forced all columns into memory then re-run the DMVs to update the data with the new values after everything has been transcoded.
Expand Down