-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
C#/Java: Content based model generation improvements. #17363
Changes from all commits
7c0101a
d2c98c8
d7e61d0
9149a17
0fbeca1
e948902
da012a7
b94940b
0abc08c
68165bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
/** | ||
* @name Capture content based summary models. | ||
* @description Finds applicable content based summary models to be used by other queries. | ||
* @kind diagnostic | ||
* @id cs/utils/modelgenerator/contentbased-summary-models | ||
* @tags modelgenerator | ||
*/ | ||
|
||
import internal.CaptureModels | ||
|
||
from DataFlowSummaryTargetApi api, string flow | ||
where flow = captureContentFlow(api) | ||
select flow order by flow |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -127,7 +127,7 @@ string captureQualifierFlow(DataFlowSummaryTargetApi api) { | |
api = returnNodeEnclosingCallable(ret) and | ||
isOwnInstanceAccessNode(ret) | ||
) and | ||
result = Printing::asValueModel(api, qualifierString(), "ReturnValue") | ||
result = Printing::asLiftedValueModel(api, qualifierString(), "ReturnValue") | ||
} | ||
|
||
private int accessPathLimit0() { result = 2 } | ||
|
@@ -237,7 +237,7 @@ string captureThroughFlow0( | |
input = parameterNodeAsInput(p) and | ||
output = getOutput(returnNodeExt) and | ||
input != output and | ||
result = Printing::asTaintModel(api, input, output) | ||
result = Printing::asLiftedTaintModel(api, input, output) | ||
) | ||
} | ||
|
||
|
@@ -291,26 +291,257 @@ private string getContent(PropagateContentFlow::AccessPath ap, int i) { | |
) | ||
} | ||
|
||
/** | ||
* Gets the MaD string representation of a store step access path. | ||
*/ | ||
private string printStoreAccessPath(PropagateContentFlow::AccessPath ap) { | ||
result = concat(int i | | getContent(ap, i), "" order by i) | ||
} | ||
|
||
/** | ||
* Gets the MaD string representation of a read step access path. | ||
*/ | ||
private string printReadAccessPath(PropagateContentFlow::AccessPath ap) { | ||
result = concat(int i | | getContent(ap, i), "" order by i desc) | ||
} | ||
|
||
string captureContentFlow(DataFlowSummaryTargetApi api) { | ||
/** | ||
* Holds if the access path `ap` contains a field or synthetic field access. | ||
*/ | ||
private predicate mentionsField(PropagateContentFlow::AccessPath ap) { | ||
exists(ContentSet head, PropagateContentFlow::AccessPath tail | | ||
head = ap.getHead() and | ||
tail = ap.getTail() | ||
| | ||
mentionsField(tail) or isField(head) | ||
) | ||
} | ||
|
||
private predicate apiFlow( | ||
DataFlowSummaryTargetApi api, DataFlow::ParameterNode p, PropagateContentFlow::AccessPath reads, | ||
ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath stores, boolean preservesValue | ||
) { | ||
PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and | ||
returnNodeExt.getEnclosingCallable() = api and | ||
p.getEnclosingCallable() = api | ||
} | ||
|
||
/** | ||
* A class of APIs relevant for modeling using content flow. | ||
* The following heuristic is applied: | ||
* Content flow is only relevant for an API, if | ||
* #content flow <= 2 * #parameters + 3 | ||
* If an API produces more content flow, it is likely that | ||
* 1. Types are not sufficiently constrained leading to a combinatorial | ||
* explosion in dispatch and thus in the generated summaries. | ||
* 2. It is a reasonable approximation to use the non-content based flow | ||
* detection instead, as reads and stores would use a significant | ||
* part of an objects internal state. | ||
*/ | ||
private class ContentDataFlowSummaryTargetApi extends DataFlowSummaryTargetApi { | ||
ContentDataFlowSummaryTargetApi() { | ||
count(string input, string output | | ||
exists( | ||
DataFlow::ParameterNode p, PropagateContentFlow::AccessPath reads, | ||
ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath stores | ||
| | ||
apiFlow(this, p, reads, returnNodeExt, stores, _) and | ||
input = parameterNodeAsContentInput(p) + printReadAccessPath(reads) and | ||
output = getContentOutput(returnNodeExt) + printStoreAccessPath(stores) | ||
) | ||
) <= 2 * this.getNumberOfParameters() + 3 | ||
} | ||
} | ||
|
||
pragma[nomagic] | ||
private predicate apiContentFlow( | ||
ContentDataFlowSummaryTargetApi api, DataFlow::ParameterNode p, | ||
PropagateContentFlow::AccessPath reads, ReturnNodeExt returnNodeExt, | ||
PropagateContentFlow::AccessPath stores, boolean preservesValue | ||
) { | ||
PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and | ||
returnNodeExt.getEnclosingCallable() = api and | ||
p.getEnclosingCallable() = api | ||
} | ||
|
||
/** | ||
* Holds if any of the content sets in `path` translates into a synthetic field. | ||
*/ | ||
private predicate hasSyntheticContent(PropagateContentFlow::AccessPath path) { | ||
exists(PropagateContentFlow::AccessPath tail, ContentSet head | | ||
head = path.getHead() and | ||
tail = path.getTail() | ||
| | ||
exists(getSyntheticName(head)) or | ||
hasSyntheticContent(tail) | ||
) | ||
} | ||
|
||
/** | ||
* A module containing predicates for validating access paths containing content sets | ||
* that translates into synthetic fields, when used for generated summary models. | ||
*/ | ||
private module AccessPathSyntheticValidation { | ||
/** | ||
* Holds if there exists an API that has content flow from `read` (on type `t1`) | ||
* to `store` (on type `t2`). | ||
*/ | ||
private predicate step( | ||
Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store | ||
) { | ||
exists(DataFlow::ParameterNode p, ReturnNodeExt returnNodeExt | | ||
p.getType() = t1 and | ||
returnNodeExt.getType() = t2 and | ||
apiContentFlow(_, p, read, returnNodeExt, store, _) | ||
) | ||
} | ||
|
||
/** | ||
* Holds if there exists an API that has content flow from `read` (on type `t1`) | ||
* to `store` (on type `t2`), where `read` does not have synthetic content and `store` does. | ||
* | ||
* Step A -> Synth. | ||
*/ | ||
private predicate synthPathEntry( | ||
Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store | ||
) { | ||
not hasSyntheticContent(read) and | ||
hasSyntheticContent(store) and | ||
step(t1, read, t2, store) | ||
} | ||
|
||
/** | ||
* Holds if there exists an API that has content flow from `read` (on type `t1`) | ||
* to `store` (on type `t2`), where `read` has synthetic content | ||
* and `store` does not. | ||
* | ||
* Step Synth -> A. | ||
*/ | ||
private predicate synthPathExit( | ||
Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store | ||
) { | ||
hasSyntheticContent(read) and | ||
not hasSyntheticContent(store) and | ||
step(t1, read, t2, store) | ||
} | ||
|
||
/** | ||
* Holds if there exists a path of steps from `read` to an exit. | ||
* | ||
* read ->* Synth -> A | ||
*/ | ||
private predicate reachesSynthExit(Type t, PropagateContentFlow::AccessPath read) { | ||
synthPathExit(t, read, _, _) | ||
or | ||
hasSyntheticContent(read) and | ||
exists(PropagateContentFlow::AccessPath mid, Type midType | | ||
hasSyntheticContent(mid) and | ||
step(t, read, midType, mid) and | ||
reachesSynthExit(midType, mid.reverse()) | ||
) | ||
} | ||
|
||
/** | ||
* Holds if there exists a path of steps from an entry to `store`. | ||
* | ||
* A -> Synth ->* store | ||
*/ | ||
private predicate synthEntryReaches(Type t, PropagateContentFlow::AccessPath store) { | ||
synthPathEntry(_, _, t, store) | ||
or | ||
hasSyntheticContent(store) and | ||
exists(PropagateContentFlow::AccessPath mid, Type midType | | ||
hasSyntheticContent(mid) and | ||
step(midType, mid, t, store) and | ||
synthEntryReaches(midType, mid.reverse()) | ||
) | ||
} | ||
|
||
/** | ||
* Holds if at least one of the access paths `read` (on type `t1`) and `store` (on type `t2`) | ||
* contain content that will be translated into a synthetic field, when being used in | ||
* a MaD summary model, and if there is a range of APIs, such that | ||
* when chaining their flow access paths, there exists access paths `A` and `B` where | ||
* A ->* read -> store ->* B and where `A` and `B` do not contain content that will | ||
* be translated into a synthetic field. | ||
* | ||
* This is needed because we don't want to include summaries that reads from or | ||
* stores into a "dead" synthetic field. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "internal" instead of "dead"? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure! I will add a commit to the re-factor PR that changes this. |
||
* | ||
* Example: | ||
* Assume we have a type `t` (in this case `t1` = `t2`) with methods `getX` and | ||
* `setX`, which gets and sets a private field `X` on `t`. | ||
* This would lead to the following content flows | ||
* getX : Argument[this].SyntheticField[t.X] -> ReturnValue. | ||
* setX : Argument[0] -> Argument[this].SyntheticField[t.X] | ||
* As the reads and stores are on synthetic fields we should only make summaries | ||
* if both of these methods exist. | ||
*/ | ||
pragma[nomagic] | ||
predicate acceptReadStore( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if it is simply enough to say that we will include a synthetic field if it is part of some input specification and part of some output specification. That should also handle cases such as
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also thought about that as a possible "improvement" (instead of using "path" equality we could "hash" paths with synthetics (basically just print the synthetics in the order they are shown in the path and then compare that)). This would allow path continuations as the one you mention there. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was actually thinking of something even simpler where the order is not taken into account, but yes follow-up is fine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could also try without order; However, I am quite sure that we need the "chaining" logic. It turned out that it is not uncommen that we produce summaries like |
||
Type t1, PropagateContentFlow::AccessPath read, Type t2, PropagateContentFlow::AccessPath store | ||
) { | ||
synthPathEntry(t1, read, t2, store) and reachesSynthExit(t2, store.reverse()) | ||
or | ||
exists(PropagateContentFlow::AccessPath store0 | store0.reverse() = read | | ||
synthEntryReaches(t1, store0) and synthPathExit(t1, read, t2, store) | ||
or | ||
synthEntryReaches(t1, store0) and | ||
step(t1, read, t2, store) and | ||
reachesSynthExit(t2, store.reverse()) | ||
) | ||
} | ||
} | ||
|
||
/** | ||
* Holds, if the API `api` has relevant flow from `read` on `p` to `store` on `returnNodeExt`. | ||
* Flow is considered relevant, | ||
* 1. If `read` or `store` do not contain a content set that translates into a synthetic field. | ||
* 2. If `read` or `store` contain a content set that translates into a synthetic field, and if | ||
* the synthetic content is "live" on the relevant declaring type. | ||
*/ | ||
private predicate apiRelevantContentFlow( | ||
ContentDataFlowSummaryTargetApi api, DataFlow::ParameterNode p, | ||
PropagateContentFlow::AccessPath read, ReturnNodeExt returnNodeExt, | ||
PropagateContentFlow::AccessPath store, boolean preservesValue | ||
) { | ||
apiContentFlow(api, p, read, returnNodeExt, store, preservesValue) and | ||
( | ||
not hasSyntheticContent(read) and not hasSyntheticContent(store) | ||
or | ||
AccessPathSyntheticValidation::acceptReadStore(p.getType(), read, returnNodeExt.getType(), store) | ||
) | ||
} | ||
|
||
pragma[nomagic] | ||
private predicate captureContentFlow0( | ||
|
||
ContentDataFlowSummaryTargetApi api, string input, string output, boolean preservesValue, | ||
boolean lift | ||
) { | ||
exists( | ||
DataFlow::ParameterNode p, ReturnNodeExt returnNodeExt, string input, string output, | ||
PropagateContentFlow::AccessPath reads, PropagateContentFlow::AccessPath stores, | ||
boolean preservesValue | ||
DataFlow::ParameterNode p, ReturnNodeExt returnNodeExt, PropagateContentFlow::AccessPath reads, | ||
PropagateContentFlow::AccessPath stores | ||
| | ||
PropagateContentFlow::flow(p, reads, returnNodeExt, stores, preservesValue) and | ||
returnNodeExt.getEnclosingCallable() = api and | ||
apiRelevantContentFlow(api, p, reads, returnNodeExt, stores, preservesValue) and | ||
input = parameterNodeAsContentInput(p) + printReadAccessPath(reads) and | ||
output = getContentOutput(returnNodeExt) + printStoreAccessPath(stores) and | ||
input != output and | ||
result = Printing::asModel(api, input, output, preservesValue) | ||
(if mentionsField(reads) or mentionsField(stores) then lift = false else lift = true) | ||
) | ||
} | ||
|
||
/** | ||
* Gets the content based summary model(s) of the API `api` (if there is flow from a parameter to | ||
* the return value or a parameter). | ||
* | ||
* Models are lifted to the best type in case the read and store access paths do not | ||
* contain a field or synthetic field access. | ||
*/ | ||
string captureContentFlow(ContentDataFlowSummaryTargetApi api) { | ||
exists(string input, string output, boolean lift, boolean preservesValue | | ||
captureContentFlow0(api, input, output, _, lift) and | ||
preservesValue = max(boolean p | captureContentFlow0(api, input, output, p, lift)) and | ||
result = Printing::asModel(api, input, output, preservesValue, lift) | ||
) | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alternatively, we could make the restriction per parameter. I.e., if a method has only one summary for
Argument[0]
then include it, but if it has more than, say 3, summaries forArgument[1]
then exclude those.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that could also be interesting.
Would it be acceptable if I do that experiment as a follow up?
As it is now we "only" exclude approximately 2% of all the possible target APIs with this limitation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes!