From 8473b8c8e67ce270de3e19e044a4c8651c82f027 Mon Sep 17 00:00:00 2001 From: Michael Nebel Date: Tue, 3 Sep 2024 10:57:00 +0200 Subject: [PATCH] Java: Prototype implementation. --- .../CaptureContentSummaryModels.ql | 2 +- .../modelgenerator/internal/CaptureModels.qll | 219 +++++++++++++++++- .../internal/CaptureModelsSpecific.qll | 26 ++- .../CaptureTypeBasedSummaryModels.qll | 2 +- .../mad/modelgenerator/ModelPrinting.qll | 43 ++-- 5 files changed, 262 insertions(+), 30 deletions(-) diff --git a/java/ql/src/utils/modelgenerator/CaptureContentSummaryModels.ql b/java/ql/src/utils/modelgenerator/CaptureContentSummaryModels.ql index e0e793348f59b..6afc83f12fc16 100644 --- a/java/ql/src/utils/modelgenerator/CaptureContentSummaryModels.ql +++ b/java/ql/src/utils/modelgenerator/CaptureContentSummaryModels.ql @@ -10,4 +10,4 @@ import internal.CaptureModels from DataFlowSummaryTargetApi api, string flow where flow = captureContentFlow(api) -select flow order by flow +select api, flow order by flow diff --git a/java/ql/src/utils/modelgenerator/internal/CaptureModels.qll b/java/ql/src/utils/modelgenerator/internal/CaptureModels.qll index d476376d3ccc0..c6a31499c91fe 100644 --- a/java/ql/src/utils/modelgenerator/internal/CaptureModels.qll +++ b/java/ql/src/utils/modelgenerator/internal/CaptureModels.qll @@ -127,7 +127,7 @@ string captureQualifierFlow(DataFlowSummaryTargetApi api) { api = returnNodeEnclosingCallable(ret) and isOwnInstanceAccessNode(ret) ) and - result = Printing::asValueModel(api, qualifierString(), "ReturnValue") + result = Printing::asLiftedValueModel(api, qualifierString(), "ReturnValue") } private int accessPathLimit0() { result = 2 } @@ -237,7 +237,7 @@ string captureThroughFlow0( input = parameterNodeAsInput(p) and output = getOutput(returnNodeExt) and input != output and - result = Printing::asTaintModel(api, input, output) + result = Printing::asLiftedTaintModel(api, input, output) ) } @@ -279,6 +279,9 @@ private module PropagateContentFlowConfig implements ContentDataFlow::ConfigSig private module PropagateContentFlow = ContentDataFlow::Global; +/** + * Gets a MaD string representation of a store step access path. + */ private string printStoreAccessPath(PropagateContentFlow::AccessPath ap) { not exists(ap.getHead()) and result = "" or @@ -289,6 +292,9 @@ private string printStoreAccessPath(PropagateContentFlow::AccessPath ap) { ) } +/** + * Gets a MaD string representation of a read step access path. + */ private string printReadAccessPath(PropagateContentFlow::AccessPath ap) { not exists(ap.getHead()) and result = "" or @@ -299,18 +305,217 @@ private string printReadAccessPath(PropagateContentFlow::AccessPath ap) { ) } -string captureContentFlow(DataFlowSummaryTargetApi api) { +/** + * Holds if the access path `ap` contains a field or synthetic field access. + */ +private predicate mentionsField(PropagateContentFlow::AccessPath ap) { + exists(ContentSet head, PropagateContentFlow::AccessPath tail | + head = ap.getHead() and + tail = ap.getTail() and + (mentionsField(tail) or isField(head)) + ) +} + +pragma[nomagic] +private predicate apiContentFlow( + DataFlowSummaryTargetApi api, DataFlow::ParameterNode p, PropagateContentFlow::AccessPath reads, + ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath stores, boolean preservesValue +) { + PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and + returnNodeExt.getEnclosingCallable() = api and + p.getEnclosingCallable() = api +} + +/** + * A class of APIs relevant for modeling using content flow. + * The following heuristic is applied: + * Content flow is only relevant for an API, if + * #content flow <= 2 * #parameters + 3 + * If an API produces more content flow, it is likely that + * 1. Types are not sufficiently constrained leading to combinatorial + * explosion in dispatch and thus in the generated summaries. + * 2. It is a reasonable approximation to use the non-content based flow + * detection instead, as reads and stores would use a significant + * part of the object internal state. + */ +private class ContentDataFlowSummaryTargetApi extends DataFlowSummaryTargetApi { + ContentDataFlowSummaryTargetApi() { + count(DataFlow::ParameterNode p, PropagateContentFlow::AccessPath reads, + ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath stores | + apiContentFlow(this, p, reads, returnNodeExt, stores, _) + ) <= 2 * this.getNumberOfParameters() + 3 + } +} + +/** + * Holds if any of the content sets in `path` translates into a synthetic field. + */ +private predicate hasSyntheticContent(PropagateContentFlow::AccessPath path) { + exists(PropagateContentFlow::AccessPath tail, ContentSet head | + head = path.getHead() and + tail = path.getTail() and + ( + exists(getSyntheticName(head)) or + hasSyntheticContent(tail) + ) + ) +} + +/** + * A module containing predicates for validating access paths containing synthetic fields + * to be used for generated summary models. + */ +private module AccessPathSyntheticValidation { + /** + * Holds if there exist an API within the type `t` that has + * content flow from `read` to `store`. + */ + private predicate step( + Type t, PropagateContentFlow::AccessPath read, PropagateContentFlow::AccessPath store + ) { + exists(ContentDataFlowSummaryTargetApi api | + api.getDeclaringType() = t and + apiContentFlow(api, _, read, _, store, _) + ) + } + + /** + * Holds if there is an API in type `t` that has content flow from `read` to `store`, + * where `read` does not have synthetic content and `store` does. + * + * Step A -> Synth. + */ + private predicate synthPathEntry( + Type t, PropagateContentFlow::AccessPath read, PropagateContentFlow::AccessPath store + ) { + not hasSyntheticContent(read) and + hasSyntheticContent(store) and + step(t, read, store) + } + + /** + * Holds if there is an API in type `t` that has content flow from `read` to `store`, + * where `read` has synthetic content and `store` does not. + * + * Step Synth -> A. + */ + private predicate synthPathExit( + Type t, PropagateContentFlow::AccessPath read, PropagateContentFlow::AccessPath store + ) { + hasSyntheticContent(read) and + not hasSyntheticContent(store) and + step(t, read, store) + } + + /** + * Takes zero or more synthetic steps. + * Synth ->* Synth + */ + private predicate synthPathStepRec( + Type t, PropagateContentFlow::AccessPath read, PropagateContentFlow::AccessPath store + ) { + hasSyntheticContent(read) and + hasSyntheticContent(store) and + ( + read = store + or + exists(PropagateContentFlow::AccessPath mid | + step(t, read, mid) and synthPathStepRec(t, mid, store) + ) + ) + } + + /** + * Holds if there exists a path of steps from `ap1` to an exit. + * ap1 ->* Synth -> A + */ + private predicate reachesSynthExit(Type t, PropagateContentFlow::AccessPath ap1) { + exists(PropagateContentFlow::AccessPath mid | + synthPathStepRec(t, ap1, mid) and synthPathExit(t, mid, _) + ) + } + + /** + * Holds if there exists a path of steps from an entry to `ap1`. + * + * A -> Synth ->* ap1 + */ + private predicate synthEntryReaches(Type t, PropagateContentFlow::AccessPath ap1) { + exists(PropagateContentFlow::AccessPath mid | + synthPathEntry(t, _, mid) and synthPathStepRec(t, mid, ap1) + ) + } + + /** + * Holds if at least one of the access paths `read` and `store` contain content + * that will be translated into a synthetic field, when being used in + * a MaD summary model, and if the type `t` contains a range of APIs, such that + * when chaining their flow access paths, there exists access paths `A` and `B` where + * A ->* read -> store ->* B and where `A` and `B` do not contain content that will + * be translated into a synthetic field. + * + * This is needed because we don't want to include summaries that reads from or + * stores into a "dead" synthetic field. + * + * Example: + * Assume we have a type `t` with methods `getX` and `setX`, which gets and sets + * a private field `X` on `t`. This would lead to the following content flows + * getX : Argument[this].SyntheticField[t.X] -> ReturnValue. + * setX : Argument[0] -> Argument[this].SyntheticField[t.X] + * As the reads and stores are on synthetic fields we should only make summaries + * if both of these methods exist. + */ + pragma[nomagic] + predicate acceptReadStore( + Type t, PropagateContentFlow::AccessPath read, PropagateContentFlow::AccessPath store + ) { + synthPathEntry(t, read, store) and reachesSynthExit(t, store) + or + synthEntryReaches(t, read) and synthPathExit(t, read, store) + or + synthEntryReaches(t, read) and step(t, read, store) and reachesSynthExit(t, store) + } +} + +/** + * Holds, if the API `api` has relevant flow from `read` on `p` to `store` on `returnNodeExt`. + * Flow is considered relevant, + * 1. If `read` or `store` do not contain content that translates into synthetic fields. + * 2. If `read` or `store` contain content that translates into synthetic fields, and + * the synthetic content is "live" on the declaring type of `api`. + */ +private predicate apiRelevantContentFlow( + ContentDataFlowSummaryTargetApi api, DataFlow::ParameterNode p, + PropagateContentFlow::AccessPath read, ReturnNodeExt returnNodeExt, + PropagateContentFlow::AccessPath store, boolean preservesValue +) { + apiContentFlow(api, p, read, returnNodeExt, store, preservesValue) and + ( + not hasSyntheticContent(read) and not hasSyntheticContent(store) + or + AccessPathSyntheticValidation::acceptReadStore(api.getDeclaringType(), read, store) + ) +} + +/** + * Gets the content based summary model(s) of `api`, if there is flow from a parameter to + * the return value or a parameter. + * + * Models are lifted to the best type in case the read and store access paths do not + * contain a field or synthetic field access. + */ +string captureContentFlow(ContentDataFlowSummaryTargetApi api) { exists( DataFlow::ParameterNode p, ReturnNodeExt returnNodeExt, string input, string output, PropagateContentFlow::AccessPath reads, PropagateContentFlow::AccessPath stores, - boolean preservesValue + boolean preservesValue, boolean lift | - PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and - returnNodeExt.getEnclosingCallable() = api and + apiRelevantContentFlow(api, p, reads, returnNodeExt, stores, preservesValue) and input = parameterNodeAsContentInput(p) + printReadAccessPath(reads) and output = getContentOutput(returnNodeExt) + printStoreAccessPath(stores) and input != output and - result = Printing::asModel(api, input, output, preservesValue) + (if mentionsField(reads) or mentionsField(stores) then lift = false else lift = true) and + result = Printing::asModel(api, input, output, preservesValue, lift) ) } diff --git a/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll b/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll index 2be162d5f9b47..e75a6caefbcb0 100644 --- a/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll +++ b/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll @@ -339,17 +339,31 @@ predicate isAdditionalContentFlowStep(DataFlow::Node node1, DataFlow::Node node2 ) } +/** Holds if the contentset `c` is a field or synthetic field. */ +predicate isField(ContentSet c) { + c instanceof DataFlowUtil::FieldContent or + c instanceof DataFlowUtil::SyntheticFieldContent +} + /** - * Gets the MaD string representation of the contentset `c`. + * Gets the MaD synthetic name string representation for the contentset `c`, if any. */ +string getSyntheticName(DataFlow::ContentSet c) { + exists(Field f | + not f.isPublic() and + f = c.(DataFlowUtil::FieldContent).getField() and + result = f.getQualifiedName() + ) + or + result = c.(DataFlowUtil::SyntheticFieldContent).getField() +} + string printContent(ContentSet c) { - exists(Field f, string name | - f = c.(DataFlowUtil::FieldContent).getField() and name = f.getQualifiedName() - | - if f.isPublic() then result = "Field[" + name + "]" else result = "SyntheticField[" + name + "]" + exists(Field f | f = c.(DataFlowUtil::FieldContent).getField() and f.isPublic() | + result = "Field[" + f.getQualifiedName() + "]" ) or - result = "SyntheticField[" + c.(DataFlowUtil::SyntheticFieldContent).getField() + "]" + result = "SyntheticField[" + getSyntheticName(c) + "]" or c instanceof DataFlowUtil::CollectionContent and result = "Element" or diff --git a/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll b/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll index 34e53bad2d7fe..3d56dff50726b 100644 --- a/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll +++ b/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll @@ -329,7 +329,7 @@ class TypeBasedFlowTargetApi extends Specific::SummaryTargetApi { output(this, tv, output) and input != output | - result = Printing::asValueModel(this, input, output) + result = Printing::asLiftedValueModel(this, input, output) ) } } diff --git a/shared/mad/codeql/mad/modelgenerator/ModelPrinting.qll b/shared/mad/codeql/mad/modelgenerator/ModelPrinting.qll index 2867d927984d1..4f5fa59d5377b 100644 --- a/shared/mad/codeql/mad/modelgenerator/ModelPrinting.qll +++ b/shared/mad/codeql/mad/modelgenerator/ModelPrinting.qll @@ -64,14 +64,23 @@ module ModelPrintingImpl { /** * Gets the summary model for `api` with `input`, `output` and `kind`. + * The model is lifted in case `lift` is true. */ bindingset[input, output, kind] - private string asSummaryModel(Printing::SummaryApi api, string input, string output, string kind) { - result = - asPartialModel(api.lift()) + input + ";" // - + output + ";" // - + kind + ";" // - + Printing::getProvenance() + private string asSummaryModel( + Printing::SummaryApi api, string input, string output, string kind, boolean lift + ) { + exists(Lang::Callable c | + lift = true and c = api.lift() + or + lift = false and c = api + | + result = + asPartialModel(c) + input + ";" // + + output + ";" // + + kind + ";" // + + Printing::getProvenance() + ) } string asNeutralSummaryModel(Printing::SummaryApi api) { @@ -82,31 +91,35 @@ module ModelPrintingImpl { } /** - * Gets the value summary model for `api` with `input` and `output`. + * Gets the lifted value summary model for `api` with `input` and `output`. */ bindingset[input, output] - string asValueModel(Printing::SummaryApi api, string input, string output) { - result = asSummaryModel(api, input, output, "value") + string asLiftedValueModel(Printing::SummaryApi api, string input, string output) { + result = asModel(api, input, output, true, true) } /** - * Gets the taint summary model for `api` with `input` and `output`. + * Gets the lifted taint summary model for `api` with `input` and `output`. */ bindingset[input, output] - string asTaintModel(Printing::SummaryApi api, string input, string output) { - result = asSummaryModel(api, input, output, "taint") + string asLiftedTaintModel(Printing::SummaryApi api, string input, string output) { + result = asModel(api, input, output, false, true) } /** * Gets the summary model for `api` with `input` and `output`. + * (1) If `preservesValue` is true a "value" model is created. + * (2) If `lift` is true the model is lifted to the best possible type. */ bindingset[input, output, preservesValue] - string asModel(Printing::SummaryApi api, string input, string output, boolean preservesValue) { + string asModel( + Printing::SummaryApi api, string input, string output, boolean preservesValue, boolean lift + ) { preservesValue = true and - result = asValueModel(api, input, output) + result = asSummaryModel(api, input, output, "value", lift) or preservesValue = false and - result = asTaintModel(api, input, output) + result = asSummaryModel(api, input, output, "taint", lift) } /**