From 89167da177d3409cb71167bbde7932fca7ed3c8e Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Mon, 2 Dec 2024 21:01:50 +0000 Subject: [PATCH 1/9] Model flow steps for lxml --- .../ql/lib/semmle/python/frameworks/Lxml.qll | 153 ++++++++++++++++-- 1 file changed, 143 insertions(+), 10 deletions(-) diff --git a/python/ql/lib/semmle/python/frameworks/Lxml.qll b/python/ql/lib/semmle/python/frameworks/Lxml.qll index fe229d66f3ae..056e1d6b68a3 100644 --- a/python/ql/lib/semmle/python/frameworks/Lxml.qll +++ b/python/ql/lib/semmle/python/frameworks/Lxml.qll @@ -8,6 +8,7 @@ private import python private import semmle.python.dataflow.new.DataFlow +private import semmle.python.dataflow.new.TaintTracking private import semmle.python.Concepts private import semmle.python.ApiGraphs private import semmle.python.frameworks.data.ModelsAsData @@ -68,16 +69,7 @@ module Lxml { */ class XPathCall extends XML::XPathExecution::Range, DataFlow::CallCfgNode { XPathCall() { - exists(API::Node parseResult | - parseResult = - etreeRef().getMember(["parse", "fromstring", "fromstringlist", "HTML", "XML"]).getReturn() - or - // TODO: lxml.etree.parseid()[0] will contain the root element from parsing - // but we don't really have a way to model that nicely. - parseResult = etreeRef().getMember("XMLParser").getReturn().getMember("close").getReturn() - | - this = parseResult.getMember("xpath").getACall() - ) + this.(DataFlow::MethodCallNode).calls([Element::instance(), ElementTree::instance()], "xpath") } override DataFlow::Node getXPath() { result in [this.getArg(0), this.getArgByName("_path")] } @@ -318,4 +310,145 @@ module Lxml { override DataFlow::Node getAPathArgument() { result = this.getAnInput() } } + + /** Provides models for instances of the `lxml.etree.Element` class. */ + module Element { + /** Gets a reference to the `Element` class. */ + API::Node classRef() { result = etreeRef().getMember("Element") } + + abstract class InstanceSource extends DataFlow::LocalSourceNode { } + + /** Gets a reference to an `lxml.etree.Element` instance. */ + private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t) { + t.start() and + result instanceof InstanceSource + or + exists(DataFlow::TypeTracker t2 | result = instance(t2).track(t2, t)) + } + + /** Gets a reference to an `lxml.etree.Element` instance. */ + DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) } + + /** An `Element` instantiated directly. */ + private class ElementInstance extends InstanceSource { + ElementInstance() { this = classRef().getACall() } + } + + /** The result of a parse operation that returns an `Element`. */ + private class ParseResult extends InstanceSource, DataFlow::MethodCallNode { + ParseResult() { + this.calls(XmlParser::instance(_), "close") + or + // TODO: `XMLID` and `parseid` returns a tuple of which the first element is an `Element` + this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getACall() + } + } + + /** An element index such as `etree.parse(...)[0]` */ + private class ElementIndex extends InstanceSource, DataFlow::Node { + ElementIndex() { this.asExpr().(Subscript).getObject() = instance().asExpr() } + } + + /** A call to a method on an `Element` that returns another `Element`. */ + private class ElementMethod extends InstanceSource, DataFlow::MethodCallNode { + ElementMethod() { + // TODO: methods that return iterators of `Element`s - `findall`, `finditer`, `iter`, a few others + // an `Element` itself can be used as an iterator of its children. + this.calls(instance(), ["find", "getnext", "getprevious", "getparent"]) + } + } + + /** A call to a method on an `ElementTree` that returns an `Element`. */ + private class ElementTreeMethod extends InstanceSource, DataFlow::MethodCallNode { + ElementTreeMethod() { this.calls(ElementTree::instance(), "getroot") } + } + + /** An additional taint step from an `Element` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase */ + private class ElementTaintStep extends TaintTracking::AdditionalTaintStep { + override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { + exists(DataFlow::MethodCallNode call | nodeTo = call and nodeFrom = instance() | + call.calls(nodeFrom, + // We don't consider sibling nodes to be tainted (getnext, getprevious, itersiblings) + [ + "cssselect", "find", "findall", "findtext", "get", "getchildren", "getiterator", + "getparent", "getroottree", "items", "iter", "iterancestors", "iterchildren", + "iterdescendants", "iterfind", "itertext", "keys", "values", "xpath" + ]) + ) + or + exists(DataFlow::AttrRead attr | nodeTo = attr and nodeFrom = instance() | + attr.accesses(nodeFrom, ["attrib", "base", "nsmap", "tag", "tail", "text"]) + ) + } + } + } + + /** Provides models for instances of the `lxml.etree.ElementTree` class. */ + module ElementTree { + API::Node classRef() { result = etreeRef().getMember("ElementTree") } + + abstract class InstanceSource extends DataFlow::LocalSourceNode { } + + /** Gets a reference to an `lxml.etree.ElementTree` instance.` */ + private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t) { + t.start() and + result instanceof InstanceSource + or + exists(DataFlow::TypeTracker t2 | result = instance(t2).track(t2, t)) + } + + /** Gets a reference to an `lxml.etree.ElementTree` parsers instance. */ + DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) } + + /** An `ElementTree` instantiated directly. */ + private class ElementTreeInstance extends InstanceSource { + ElementTreeInstance() { this = classRef().getACall() } + } + + /** The result of a parst operation that returns an `ElementTree` */ + private class ParseResult extends InstanceSource, DataFlow::MethodCallNode { + ParseResult() { this = etreeRef().getMember("parse").getACall() } + } + + /** A call to a method on an `Element` that returns another `Element`. */ + private class ElementMethod extends InstanceSource, DataFlow::MethodCallNode { + ElementMethod() { + // TODO: methods that return iterators of `Element`s - `findall`, `finditer`, `iter`, a few others + // an `Element` itself can be used as an iterator of its children. + this.calls(Element::instance(), "getroottree") + } + } + + /** An additional taint step from an `ElementTree` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree._ElementTree */ + private class ElementTaintStep extends TaintTracking::AdditionalTaintStep { + override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { + exists(DataFlow::MethodCallNode call | nodeTo = call and nodeFrom = instance() | + call.calls(nodeFrom, + [ + "find", "findall", "findtext", "get", "getiterator", "getroot", "iter", "iterfind", + "xpath" + ]) + ) + or + exists(DataFlow::AttrRead attr | nodeTo = attr and nodeFrom = instance() | + attr.accesses(nodeFrom, "docinfo") + ) + } + } + } + + /** A call to serialise xml to a string */ + private class XmlEncoding extends Encoding::Range, DataFlow::CallCfgNode { + XmlEncoding() { this = etreeRef().getMember("tostring").getACall() } + + override DataFlow::Node getAnInput() { + result = [this.getArg(0), this.getArgByName("element_or_tree")] + } + + override DataFlow::Node getOutput() { result = this } + + override string getFormat() { result = "XML" } + } + // TODO: ElementTree.write can write to a file-like object; should that be a flow step? + // It also can accept a filepath which could be a path injection sink. } From d2b0d7a7432e2b3884f1dadf39d71cdf6d695e15 Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Wed, 4 Dec 2024 11:08:49 +0000 Subject: [PATCH 2/9] Add missing qldoc --- python/ql/lib/semmle/python/frameworks/Lxml.qll | 15 +++++++++++++-- .../ext/supported-threat-models.model.yml | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/python/ql/lib/semmle/python/frameworks/Lxml.qll b/python/ql/lib/semmle/python/frameworks/Lxml.qll index 056e1d6b68a3..4ff97867c508 100644 --- a/python/ql/lib/semmle/python/frameworks/Lxml.qll +++ b/python/ql/lib/semmle/python/frameworks/Lxml.qll @@ -387,6 +387,15 @@ module Lxml { module ElementTree { API::Node classRef() { result = etreeRef().getMember("ElementTree") } + /** + * A source of instances of `lxml.etree.ElementTree` instances, extend this class to model new instances. + * + * This can include instantiations of the class, return values from function + * calls, or a special parameter that will be set when functions are called by an external + * library. + * + * Use the predicate `ElementTree::instance()` to get references to instances of `lxml.etree.ElementTree` instances. + */ abstract class InstanceSource extends DataFlow::LocalSourceNode { } /** Gets a reference to an `lxml.etree.ElementTree` instance.` */ @@ -397,7 +406,7 @@ module Lxml { exists(DataFlow::TypeTracker t2 | result = instance(t2).track(t2, t)) } - /** Gets a reference to an `lxml.etree.ElementTree` parsers instance. */ + /** Gets a reference to an `lxml.etree.ElementTree` instance. */ DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) } /** An `ElementTree` instantiated directly. */ @@ -439,7 +448,9 @@ module Lxml { /** A call to serialise xml to a string */ private class XmlEncoding extends Encoding::Range, DataFlow::CallCfgNode { - XmlEncoding() { this = etreeRef().getMember("tostring").getACall() } + XmlEncoding() { + this = etreeRef().getMember(["tostring", "tostringlist", "tounicode"]).getACall() + } override DataFlow::Node getAnInput() { result = [this.getArg(0), this.getArgByName("element_or_tree")] diff --git a/shared/threat-models/ext/supported-threat-models.model.yml b/shared/threat-models/ext/supported-threat-models.model.yml index 59589f50f386..dd20a30d7c97 100644 --- a/shared/threat-models/ext/supported-threat-models.model.yml +++ b/shared/threat-models/ext/supported-threat-models.model.yml @@ -4,3 +4,4 @@ extensions: extensible: threatModelConfiguration data: - ["default", true, -2147483648] # The "default" threat model is included by default + - ["all", true, 1] \ No newline at end of file From d2ed92d6d0c1617ea8b3d01aa64fe916128598c5 Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Thu, 5 Dec 2024 13:45:51 +0000 Subject: [PATCH 3/9] Added tests --- .../frameworks/lxml/InlineTaintTest.expected | 0 .../frameworks/lxml/InlineTaintTest.ql | 2 + .../frameworks/lxml/taint_test.py | 103 ++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100644 python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected create mode 100644 python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.ql create mode 100644 python/ql/test/library-tests/frameworks/lxml/taint_test.py diff --git a/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected b/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.ql b/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.ql new file mode 100644 index 000000000000..8524da5fe7db --- /dev/null +++ b/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.ql @@ -0,0 +1,2 @@ +import experimental.meta.InlineTaintTest +import MakeInlineTaintTest diff --git a/python/ql/test/library-tests/frameworks/lxml/taint_test.py b/python/ql/test/library-tests/frameworks/lxml/taint_test.py new file mode 100644 index 000000000000..7dd1b9455ca6 --- /dev/null +++ b/python/ql/test/library-tests/frameworks/lxml/taint_test.py @@ -0,0 +1,103 @@ +import lxml.etree as ET + +def ensure_tainted(*args): + pass + +TAINTED_STRING = "" +src = TAINTED_STRING + +def test(): + ensure_tainted( + src, # $ tainted + ET.fromstring(src), # $ tainted + ET.XML(src), # $ tainted + ET.HTML(src), # $ tainted + ET.fromstringlist([src]), # $ tainted + ET.XMLID(src), # $ tainted + ET.XMLDTD(src), # $ tainted + ) + + + parser = ET.XmlParser() + parser.feed(src) + ensure_tainted(parser.close()), # $ tainted + + parser2 = ET.get_default_parser() + parser.feed(data=src) + ensure_tainted(parser2.close()), # $ tainted + + elem = ET.XML(src) + ensure_tainted( + elem, # $ tainted + ET.tostring(elem), # $ tainted + ET.tostringlist(elem), # $ tainted + elem.attrib, # $ tainted + elem.base, # $ tainted + elem.nsmap, # $ tainted + elem.prefix, # $ tainted + elem.tag, # $ tainted + elem.tail, # $ tainted + elem.text, # $ tainted + elem[0], # $ tainted + elem[0].text, # $ tainted + elem.cssselect("b"), # $ tainted + elem.cssselect("b")[0].text, # $ tainted + elem.find("b").text, # $ tainted + elem.findall("b"), # $ tainted + list(elem.findall("b"))[0].text, # $ tainted + elem.get("at"), # $ tainted + elem.getchildren(), # $ tainted + list(elem.getchildren())[0].text, # $ tainted, + elem.getiterator(), # $ tainted + list(elem.getiterator())[0].text, # $ tainted + elem.getnext().text, # $ tainted + elem.getparent().text, # $ tainted + elem.getprevious().text, # $ tainted + elem.getroottree(), # $ tainted + elem.getroottree().getroot().text, # $ tainted + elem.items(), # $ tainted + list(elem.items())[0].text, # $ tainted + elem.iter(), # $ tainted + list(elem.iter())[0].text, # $ tainted + elem.iterancestors(), # $ tainted + list(elem.iterancestors())[0].text, # $ tainted + elem.iterchildren(), # $ tainted + list(elem.iterchildren())[0].text, # $ tainted + elem.iterdecendants(), # $ tainted + list(elem.iterdecendants())[0].text, # $ tainted + elem.iterfind(), # $ tainted + list(elem.iterfind())[0].text, # $ tainted + elem.itersiblings(), # $ tainted + list(elem.itersiblings())[0].text, # $ tainted + elem.itertext(), # $ tainted + list(elem.itertext())[0].text, # $ tainted + elem.keys(), # $ tainted + elem.values(), # $ tainted + elem.xpath("b"), # $ tainted + list(elem.xpath("b"))[0].text, # $ tainted + ) + + for ch in elem: + ensure_tainted( + ch, # $ tainted + ch.text # $ tainted + ) + + tree = ET.parse(src) + ensure_tainted( + tree, # $ tainted + tree.getroot().text, # $ tainted + tree.find("a").text, # $ tainted + tree.findall("a"), # $ tainted + list(tree.findall("a"))[0].text, # $ tainted + tree.getiterator(), # $ tainted + list(tree.getiterator())[0].text, # $ tainted + tree.iter(), # $ tainted + list(tree.iter())[0].text, # $ tainted + tree.iterfind(), # $ tainted + list(tree.iterfind())[0].text, # $ tainted + ) + + + +test() \ No newline at end of file From 29a90235e8ea39a1865b4a58b77668068a70168f Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Tue, 10 Dec 2024 17:42:22 +0000 Subject: [PATCH 4/9] Improve tests and use API graphs --- .../ql/lib/semmle/python/frameworks/Lxml.qll | 129 ++++++++++-------- .../frameworks/lxml/InlineTaintTest.expected | 4 + .../frameworks/lxml/taint_test.py | 125 ++++++++++------- 3 files changed, 156 insertions(+), 102 deletions(-) diff --git a/python/ql/lib/semmle/python/frameworks/Lxml.qll b/python/ql/lib/semmle/python/frameworks/Lxml.qll index 4ff97867c508..e12c9e696d0d 100644 --- a/python/ql/lib/semmle/python/frameworks/Lxml.qll +++ b/python/ql/lib/semmle/python/frameworks/Lxml.qll @@ -69,7 +69,7 @@ module Lxml { */ class XPathCall extends XML::XPathExecution::Range, DataFlow::CallCfgNode { XPathCall() { - this.(DataFlow::MethodCallNode).calls([Element::instance(), ElementTree::instance()], "xpath") + this = [Element::instance(), ElementTree::instance()].getMember("xpath").getACall() } override DataFlow::Node getXPath() { result in [this.getArg(0), this.getArgByName("_path")] } @@ -201,6 +201,7 @@ module Lxml { * A call to either of: * - `lxml.etree.fromstring` * - `lxml.etree.fromstringlist` + * - * - `lxml.etree.XML` * - `lxml.etree.XMLID` * - `lxml.etree.parse` @@ -209,8 +210,10 @@ module Lxml { * See * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstring * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstringlist + * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.HTML * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XML * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XMLID + * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.XMLDTDID * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parse * - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parseid */ @@ -218,14 +221,16 @@ module Lxml { string functionName; LxmlParsing() { - functionName in ["fromstring", "fromstringlist", "XML", "XMLID", "parse", "parseid"] and + functionName in [ + "fromstring", "fromstringlist", "HTML", "XML", "XMLID", "XMLDTDID", "parse", "parseid" + ] and this = etreeRef().getMember(functionName).getACall() } override DataFlow::Node getAnInput() { result in [ this.getArg(0), - // fromstring / XML / XMLID + // fromstring / HTML / XML / XMLID / XMLDTDID this.getArgByName("text"), // fromstringlist this.getArgByName("strings"), @@ -240,7 +245,8 @@ module Lxml { this.getParserArg() = XmlParser::instanceVulnerableTo(kind) or kind.isXxe() and - not exists(this.getParserArg()) + not exists(this.getParserArg()) and + not functionName = "HTML" } override predicate mayExecuteInput() { none() } @@ -314,70 +320,89 @@ module Lxml { /** Provides models for instances of the `lxml.etree.Element` class. */ module Element { /** Gets a reference to the `Element` class. */ - API::Node classRef() { result = etreeRef().getMember("Element") } - - abstract class InstanceSource extends DataFlow::LocalSourceNode { } + API::Node classRef() { result = etreeRef().getMember(["Element", "_Element"]) } - /** Gets a reference to an `lxml.etree.Element` instance. */ - private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t) { - t.start() and - result instanceof InstanceSource - or - exists(DataFlow::TypeTracker t2 | result = instance(t2).track(t2, t)) + abstract class InstanceSource instanceof API::Node { + string toString() { result = super.toString() } } /** Gets a reference to an `lxml.etree.Element` instance. */ - DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) } + API::Node instance() { result instanceof InstanceSource } /** An `Element` instantiated directly. */ private class ElementInstance extends InstanceSource { - ElementInstance() { this = classRef().getACall() } + ElementInstance() { this = classRef().getAnInstance() } } /** The result of a parse operation that returns an `Element`. */ - private class ParseResult extends InstanceSource, DataFlow::MethodCallNode { + private class ParseResult extends InstanceSource { ParseResult() { - this.calls(XmlParser::instance(_), "close") + // TODO: The XmlParser module does not currently use API graphs + this = + [ + etreeRef().getMember("XMLParser").getAnInstance(), + etreeRef().getMember("get_default_parser").getReturn() + ].getMember("close").getReturn() or // TODO: `XMLID` and `parseid` returns a tuple of which the first element is an `Element` - this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getACall() + this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getReturn() } } - /** An element index such as `etree.parse(...)[0]` */ - private class ElementIndex extends InstanceSource, DataFlow::Node { - ElementIndex() { this.asExpr().(Subscript).getObject() = instance().asExpr() } - } - /** A call to a method on an `Element` that returns another `Element`. */ - private class ElementMethod extends InstanceSource, DataFlow::MethodCallNode { + private class ElementMethod extends InstanceSource { ElementMethod() { - // TODO: methods that return iterators of `Element`s - `findall`, `finditer`, `iter`, a few others - // an `Element` itself can be used as an iterator of its children. - this.calls(instance(), ["find", "getnext", "getprevious", "getparent"]) + // an Element is an iterator of Elements + this = instance().getASubscript() + or + // methods that return an Element + this = instance().getMember(["find", "getnext", "getprevious", "getparent"]).getReturn() + or + // methods that return an iterator of Elements + this = + instance() + .getMember([ + "cssselect", "findall", "getchildren", "getiterator", "iter", "iterancestors", + "iterdecendants", "iterchildren", "itersiblings", "iterfind", "xpath" + ]) + .getReturn() + .getASubscript() } } /** A call to a method on an `ElementTree` that returns an `Element`. */ - private class ElementTreeMethod extends InstanceSource, DataFlow::MethodCallNode { - ElementTreeMethod() { this.calls(ElementTree::instance(), "getroot") } + private class ElementTreeMethod extends InstanceSource { + ElementTreeMethod() { + this = ElementTree::instance().getMember(["getroot", "find"]).getReturn() + or + this = + ElementTree::instance() + .getMember(["findall", "getiterator", "iter", "iterfind", "xpath"]) + .getReturn() + .getASubscript() + } } /** An additional taint step from an `Element` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase */ private class ElementTaintStep extends TaintTracking::AdditionalTaintStep { override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { - exists(DataFlow::MethodCallNode call | nodeTo = call and nodeFrom = instance() | + exists(DataFlow::MethodCallNode call | + nodeTo = call and instance().asSource().flowsTo(nodeFrom) + | call.calls(nodeFrom, - // We don't consider sibling nodes to be tainted (getnext, getprevious, itersiblings) + // We consider a node to be tainted if there could be taint anywhere in the element tree + // So sibling nodes (e.g. `getnext`) are also tainted + // This ensures nodes like `elem[0].getnext()` are tracked. [ "cssselect", "find", "findall", "findtext", "get", "getchildren", "getiterator", - "getparent", "getroottree", "items", "iter", "iterancestors", "iterchildren", - "iterdescendants", "iterfind", "itertext", "keys", "values", "xpath" + "getnext", "getparent", "getprevious", "getroottree", "items", "iter", + "iterancestors", "iterchildren", "iterdescendants", "itersiblings", "iterfind", + "itertext", "keys", "values", "xpath" ]) ) or - exists(DataFlow::AttrRead attr | nodeTo = attr and nodeFrom = instance() | - attr.accesses(nodeFrom, ["attrib", "base", "nsmap", "tag", "tail", "text"]) + exists(DataFlow::AttrRead attr | nodeTo = attr and instance().asSource().flowsTo(nodeFrom) | + attr.accesses(nodeFrom, ["attrib", "base", "nsmap", "prefix", "tag", "tail", "text"]) ) } } @@ -385,7 +410,7 @@ module Lxml { /** Provides models for instances of the `lxml.etree.ElementTree` class. */ module ElementTree { - API::Node classRef() { result = etreeRef().getMember("ElementTree") } + API::Node classRef() { result = etreeRef().getMember(["ElementTree", "_ElementTree"]) } /** * A source of instances of `lxml.etree.ElementTree` instances, extend this class to model new instances. @@ -396,42 +421,34 @@ module Lxml { * * Use the predicate `ElementTree::instance()` to get references to instances of `lxml.etree.ElementTree` instances. */ - abstract class InstanceSource extends DataFlow::LocalSourceNode { } - - /** Gets a reference to an `lxml.etree.ElementTree` instance.` */ - private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t) { - t.start() and - result instanceof InstanceSource - or - exists(DataFlow::TypeTracker t2 | result = instance(t2).track(t2, t)) + abstract class InstanceSource instanceof API::Node { + string toString() { result = super.toString() } } /** Gets a reference to an `lxml.etree.ElementTree` instance. */ - DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) } + API::Node instance() { result instanceof InstanceSource } /** An `ElementTree` instantiated directly. */ private class ElementTreeInstance extends InstanceSource { - ElementTreeInstance() { this = classRef().getACall() } + ElementTreeInstance() { this = classRef().getAnInstance() } } /** The result of a parst operation that returns an `ElementTree` */ - private class ParseResult extends InstanceSource, DataFlow::MethodCallNode { - ParseResult() { this = etreeRef().getMember("parse").getACall() } + private class ParseResult extends InstanceSource { + ParseResult() { this = etreeRef().getMember("parse").getReturn() } } /** A call to a method on an `Element` that returns another `Element`. */ - private class ElementMethod extends InstanceSource, DataFlow::MethodCallNode { - ElementMethod() { - // TODO: methods that return iterators of `Element`s - `findall`, `finditer`, `iter`, a few others - // an `Element` itself can be used as an iterator of its children. - this.calls(Element::instance(), "getroottree") - } + private class ElementMethod extends InstanceSource { + ElementMethod() { this = Element::instance().getMember("getroottree").getReturn() } } /** An additional taint step from an `ElementTree` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree._ElementTree */ private class ElementTaintStep extends TaintTracking::AdditionalTaintStep { override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { - exists(DataFlow::MethodCallNode call | nodeTo = call and nodeFrom = instance() | + exists(DataFlow::MethodCallNode call | + nodeTo = call and instance().asSource().flowsTo(nodeFrom) + | call.calls(nodeFrom, [ "find", "findall", "findtext", "get", "getiterator", "getroot", "iter", "iterfind", @@ -439,7 +456,7 @@ module Lxml { ]) ) or - exists(DataFlow::AttrRead attr | nodeTo = attr and nodeFrom = instance() | + exists(DataFlow::AttrRead attr | nodeTo = attr and instance().asSource().flowsTo(nodeFrom) | attr.accesses(nodeFrom, "docinfo") ) } diff --git a/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected b/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected index e69de29bb2d1..366de37b8677 100644 --- a/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected +++ b/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected @@ -0,0 +1,4 @@ +argumentToEnsureNotTaintedNotMarkedAsSpurious +untaintedArgumentToEnsureTaintedNotMarkedAsMissing +testFailures +failures diff --git a/python/ql/test/library-tests/frameworks/lxml/taint_test.py b/python/ql/test/library-tests/frameworks/lxml/taint_test.py index 7dd1b9455ca6..2a6f9938c199 100644 --- a/python/ql/test/library-tests/frameworks/lxml/taint_test.py +++ b/python/ql/test/library-tests/frameworks/lxml/taint_test.py @@ -1,36 +1,37 @@ import lxml.etree as ET +import io def ensure_tainted(*args): - pass + print("ensure_tainted: ", *args) -TAINTED_STRING = "" +TAINTED_STRING = "" src = TAINTED_STRING def test(): ensure_tainted( src, # $ tainted - ET.fromstring(src), # $ tainted - ET.XML(src), # $ tainted - ET.HTML(src), # $ tainted - ET.fromstringlist([src]), # $ tainted - ET.XMLID(src), # $ tainted - ET.XMLDTD(src), # $ tainted + ET.fromstring(src), # $ tainted decodeFormat=XML decodeInput=src xmlVuln='XXE' decodeOutput=ET.fromstring(..) + ET.XML(src), # $ tainted decodeFormat=XML decodeInput=src xmlVuln='XXE' decodeOutput=ET.XML(..) + ET.HTML(src), # $ tainted decodeFormat=XML decodeInput=src decodeOutput=ET.HTML(..) + ET.fromstringlist([src]), # $ tainted decodeFormat=XML decodeInput=List xmlVuln='XXE' decodeOutput=ET.fromstringlist(..) + ET.XMLID(src), # $ tainted decodeFormat=XML decodeInput=src xmlVuln='XXE' decodeOutput=ET.XMLID(..) + ET.XMLDTDID(src), # $ tainted decodeFormat=XML decodeInput=src xmlVuln='XXE' decodeOutput=ET.XMLDTDID(..) ) - parser = ET.XmlParser() - parser.feed(src) - ensure_tainted(parser.close()), # $ tainted + parser = ET.XMLParser() + parser.feed(src) # $ decodeFormat=XML decodeInput=src xmlVuln='XXE' + ensure_tainted(parser.close()), # $ tainted decodeOutput=parser.close() parser2 = ET.get_default_parser() - parser.feed(data=src) - ensure_tainted(parser2.close()), # $ tainted + parser2.feed(data=src) # $ decodeFormat=XML decodeInput=src xmlVuln='XXE' + ensure_tainted(parser2.close()), # $ tainted decodeOutput=parser2.close() - elem = ET.XML(src) + elem = ET.XML(src) # $ decodeFormat=XML decodeInput=src xmlVuln='XXE' decodeOutput=ET.XML(..) ensure_tainted( elem, # $ tainted - ET.tostring(elem), # $ tainted - ET.tostringlist(elem), # $ tainted + ET.tostring(elem), # $ tainted encodeFormat=XML encodeInput=elem encodeOutput=ET.tostring(..) + ET.tostringlist(elem), # $ tainted encodeFormat=XML encodeInput=elem encodeOutput=ET.tostringlist(..) elem.attrib, # $ tainted elem.base, # $ tainted elem.nsmap, # $ tainted @@ -44,60 +45,92 @@ def test(): elem.cssselect("b")[0].text, # $ tainted elem.find("b").text, # $ tainted elem.findall("b"), # $ tainted - list(elem.findall("b"))[0].text, # $ tainted + elem.findall("b")[0].text, # $ tainted + list(elem.findall("b"))[0].text, # $ MISSING:tainted # Type tracking is not followed through call to `list`, elem.get("at"), # $ tainted - elem.getchildren(), # $ tainted - list(elem.getchildren())[0].text, # $ tainted, + elem.getchildren()[0].text, # $ tainted elem.getiterator(), # $ tainted - list(elem.getiterator())[0].text, # $ tainted - elem.getnext().text, # $ tainted - elem.getparent().text, # $ tainted - elem.getprevious().text, # $ tainted + next(elem.getiterator()).text, # $ MISSING:tainted + elem[0].getnext().text, # $ tainted + elem[0].getparent().text, # $ tainted + elem[1].getprevious().text, # $ tainted elem.getroottree(), # $ tainted elem.getroottree().getroot().text, # $ tainted elem.items(), # $ tainted - list(elem.items())[0].text, # $ tainted elem.iter(), # $ tainted - list(elem.iter())[0].text, # $ tainted + next(elem.iter()).text, # $ MISSING:tainted elem.iterancestors(), # $ tainted - list(elem.iterancestors())[0].text, # $ tainted - elem.iterchildren(), # $ tainted - list(elem.iterchildren())[0].text, # $ tainted - elem.iterdecendants(), # $ tainted - list(elem.iterdecendants())[0].text, # $ tainted - elem.iterfind(), # $ tainted - list(elem.iterfind())[0].text, # $ tainted - elem.itersiblings(), # $ tainted - list(elem.itersiblings())[0].text, # $ tainted + next(elem[0].iterancestors()).text, # $ MISSING:tainted + elem.iterchildren(), # $ tainted + next(elem.iterchildren()).text, # $ MISSING:tainted + elem.iterdescendants(), # $ tainted + next(elem.iterdescendants()).text, # $ MISSING:tainted + elem.iterfind("b"), # $ tainted + next(elem.iterfind("b")).text, # $ MISSING:tainted + elem[0].itersiblings(), # $ tainted + next(elem[0].itersiblings()).text, # $ MISSING:tainted elem.itertext(), # $ tainted - list(elem.itertext())[0].text, # $ tainted elem.keys(), # $ tainted elem.values(), # $ tainted - elem.xpath("b"), # $ tainted - list(elem.xpath("b"))[0].text, # $ tainted + elem.xpath("b")[0].text, # $ tainted getXPath="b" ) for ch in elem: ensure_tainted( ch, # $ tainted - ch.text # $ tainted + ch.text # $ MISSING: tainted # API node getASubscript() appears to not consider things like for loops ) - tree = ET.parse(src) + buf = io.StringIO(src) + tree = ET.parse(buf) # $ decodeFormat=XML decodeInput=buf xmlVuln='XXE' decodeOutput=ET.parse(..) SPURIOUS:getAPathArgument=buf # Spurious as this is used as a file-like objectt, not a path ensure_tainted( tree, # $ tainted tree.getroot().text, # $ tainted - tree.find("a").text, # $ tainted - tree.findall("a"), # $ tainted - list(tree.findall("a"))[0].text, # $ tainted + tree.find("b").text, # $ tainted + tree.findall("b")[0].text, # $ tainted tree.getiterator(), # $ tainted - list(tree.getiterator())[0].text, # $ tainted + next(tree.getiterator()).text, # $ MISSING:tainted tree.iter(), # $ tainted - list(tree.iter())[0].text, # $ tainted - tree.iterfind(), # $ tainted - list(tree.iterfind())[0].text, # $ tainted + next(tree.iter()).text, # $ MISSING:tainted + tree.iterfind("b"), # $ tainted + next(tree.iterfind("b")).text, # $ MISSING:tainted ) + (elem2, ids) = ET.XMLID(src) # $ decodeFormat=XML decodeInput=src xmlVuln='XXE' decodeOutput=ET.XMLID(..) + (elem3, ids2) = ET.XMLDTDID(src) # $ decodeFormat=XML decodeInput=src xmlVuln='XXE' decodeOutput=ET.XMLDTDID(..) + + ensure_tainted( + elem2, # $ tainted + elem3, # $ tainted + ids, # $ tainted + ids2, # $ tainted + elem2.text, # $ MISSING:tainted # Type is not tracked to the tuple return value + elem3.text, # $ MISSING:tainted + ) + + buf = io.StringIO(src) + (tree2,ids3) = ET.parseid(buf) # $ decodeFormat=XML decodeInput=buf xmlVuln='XXE' decodeOutput=ET.parseid(..) SPURIOUS:getAPathArgument=buf + + ensure_tainted( + tree2, # $ tainted + ids3, # $ tainted + tree2.getroot() # $ MISSING:tainted + ) + + buf = io.BytesIO(bytes(src, "utf8")) + for ev, el in ET.iterparse(buf): # $ decodeFormat=XML decodeInput=buf xmlVuln='XXE' decodeOutput=ET.iterparse(..) SPURIOUS:getAPathArgument=buf + ensure_tainted( + el, # $ tainted + el.text, # $ MISSING:tainted + ) + + def func(tree_arg: ET.ElementTree): + ensure_tainted( + tree_arg, # $ tainted + tree_arg.getroot().text # $ tainted # Type tracking from the type hint + ) + + func(tree2) test() \ No newline at end of file From bcb08bbc7bac93cd30afdaf306325f48792374b4 Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Tue, 10 Dec 2024 19:24:05 +0000 Subject: [PATCH 5/9] Update test output --- .../test/library-tests/frameworks/lxml/InlineTaintTest.expected | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected b/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected index 366de37b8677..020c338fd192 100644 --- a/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected +++ b/python/ql/test/library-tests/frameworks/lxml/InlineTaintTest.expected @@ -1,4 +1,3 @@ argumentToEnsureNotTaintedNotMarkedAsSpurious untaintedArgumentToEnsureTaintedNotMarkedAsMissing testFailures -failures From 5c8ef28d1272f9038802f12bf9591bb83edf07ce Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Wed, 11 Dec 2024 11:54:01 +0000 Subject: [PATCH 6/9] Add missing qldoc and revert accidentilly commited threat model change --- python/ql/lib/semmle/python/frameworks/Lxml.qll | 12 ++++++++++++ .../ext/supported-threat-models.model.yml | 1 - 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/ql/lib/semmle/python/frameworks/Lxml.qll b/python/ql/lib/semmle/python/frameworks/Lxml.qll index e12c9e696d0d..872230fddb55 100644 --- a/python/ql/lib/semmle/python/frameworks/Lxml.qll +++ b/python/ql/lib/semmle/python/frameworks/Lxml.qll @@ -322,7 +322,17 @@ module Lxml { /** Gets a reference to the `Element` class. */ API::Node classRef() { result = etreeRef().getMember(["Element", "_Element"]) } + /** + * A source of instances of `lxml.etree.Element` instances, extend this class to model new instances. + * + * This can include instantiations of the class, return values from function + * calls, or a special parameter that will be set when functions are called by an external + * library. + * + * Use the predicate `Element::instance()` to get references to instances of `lxml.etree.ElementTree` instances. + */ abstract class InstanceSource instanceof API::Node { + /** Gets a textual representation of this element. */ string toString() { result = super.toString() } } @@ -410,6 +420,7 @@ module Lxml { /** Provides models for instances of the `lxml.etree.ElementTree` class. */ module ElementTree { + /** Gets a reference to the `ElementTree` class. */ API::Node classRef() { result = etreeRef().getMember(["ElementTree", "_ElementTree"]) } /** @@ -422,6 +433,7 @@ module Lxml { * Use the predicate `ElementTree::instance()` to get references to instances of `lxml.etree.ElementTree` instances. */ abstract class InstanceSource instanceof API::Node { + /** Gets a textual representation of this element. */ string toString() { result = super.toString() } } diff --git a/shared/threat-models/ext/supported-threat-models.model.yml b/shared/threat-models/ext/supported-threat-models.model.yml index dd20a30d7c97..59589f50f386 100644 --- a/shared/threat-models/ext/supported-threat-models.model.yml +++ b/shared/threat-models/ext/supported-threat-models.model.yml @@ -4,4 +4,3 @@ extensions: extensible: threatModelConfiguration data: - ["default", true, -2147483648] # The "default" threat model is included by default - - ["all", true, 1] \ No newline at end of file From 2019ddfa7ffec6040c227fd1811a98b84d002ec1 Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Wed, 11 Dec 2024 12:25:40 +0000 Subject: [PATCH 7/9] Qldoc improvements + add a few extra tests --- .../ql/lib/semmle/python/frameworks/Lxml.qll | 21 ++++++++++++------- .../frameworks/lxml/taint_test.py | 4 +++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/python/ql/lib/semmle/python/frameworks/Lxml.qll b/python/ql/lib/semmle/python/frameworks/Lxml.qll index 872230fddb55..529adb98f86c 100644 --- a/python/ql/lib/semmle/python/frameworks/Lxml.qll +++ b/python/ql/lib/semmle/python/frameworks/Lxml.qll @@ -69,6 +69,8 @@ module Lxml { */ class XPathCall extends XML::XPathExecution::Range, DataFlow::CallCfgNode { XPathCall() { + // TODO: lxml.etree.parseid()[0] will contain the root element from parsing + // but we don't really have a way to model that nicely. this = [Element::instance(), ElementTree::instance()].getMember("xpath").getACall() } @@ -201,9 +203,10 @@ module Lxml { * A call to either of: * - `lxml.etree.fromstring` * - `lxml.etree.fromstringlist` - * - + * - `lxml.etree.HTML` * - `lxml.etree.XML` * - `lxml.etree.XMLID` + * - `lxml.etree.XMLDTDID` * - `lxml.etree.parse` * - `lxml.etree.parseid` * @@ -329,7 +332,7 @@ module Lxml { * calls, or a special parameter that will be set when functions are called by an external * library. * - * Use the predicate `Element::instance()` to get references to instances of `lxml.etree.ElementTree` instances. + * Use the predicate `Element::instance()` to get references to instances of `lxml.etree.Element` instances. */ abstract class InstanceSource instanceof API::Node { /** Gets a textual representation of this element. */ @@ -354,7 +357,8 @@ module Lxml { etreeRef().getMember("get_default_parser").getReturn() ].getMember("close").getReturn() or - // TODO: `XMLID` and `parseid` returns a tuple of which the first element is an `Element` + // TODO: `XMLID`, `XMLDTDID`, `parseid` returns a tuple of which the first element is an `Element`. + // `iterparse` returns an iterator of tuples, each of which has a second element that is an `Element`. this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getReturn() } } @@ -393,15 +397,18 @@ module Lxml { } } - /** An additional taint step from an `Element` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase */ + /** + * An additional taint step from an `Element` instance. + * See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase. + */ private class ElementTaintStep extends TaintTracking::AdditionalTaintStep { override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { exists(DataFlow::MethodCallNode call | nodeTo = call and instance().asSource().flowsTo(nodeFrom) | call.calls(nodeFrom, - // We consider a node to be tainted if there could be taint anywhere in the element tree - // So sibling nodes (e.g. `getnext`) are also tainted + // We consider a node to be tainted if there could be taint anywhere in the element tree; + // So sibling nodes (e.g. `getnext`) are also tainted. // This ensures nodes like `elem[0].getnext()` are tracked. [ "cssselect", "find", "findall", "findtext", "get", "getchildren", "getiterator", @@ -445,7 +452,7 @@ module Lxml { ElementTreeInstance() { this = classRef().getAnInstance() } } - /** The result of a parst operation that returns an `ElementTree` */ + /** The result of a parst operation that returns an `ElementTree`. */ private class ParseResult extends InstanceSource { ParseResult() { this = etreeRef().getMember("parse").getReturn() } } diff --git a/python/ql/test/library-tests/frameworks/lxml/taint_test.py b/python/ql/test/library-tests/frameworks/lxml/taint_test.py index 2a6f9938c199..d978d855d88d 100644 --- a/python/ql/test/library-tests/frameworks/lxml/taint_test.py +++ b/python/ql/test/library-tests/frameworks/lxml/taint_test.py @@ -32,6 +32,7 @@ def test(): elem, # $ tainted ET.tostring(elem), # $ tainted encodeFormat=XML encodeInput=elem encodeOutput=ET.tostring(..) ET.tostringlist(elem), # $ tainted encodeFormat=XML encodeInput=elem encodeOutput=ET.tostringlist(..) + ET.tounicode(elem), # $ tainted encodeFormat=XML encodeInput=elem encodeOutput=ET.tounicode(..) elem.attrib, # $ tainted elem.base, # $ tainted elem.nsmap, # $ tainted @@ -82,7 +83,7 @@ def test(): ) buf = io.StringIO(src) - tree = ET.parse(buf) # $ decodeFormat=XML decodeInput=buf xmlVuln='XXE' decodeOutput=ET.parse(..) SPURIOUS:getAPathArgument=buf # Spurious as this is used as a file-like objectt, not a path + tree = ET.parse(buf) # $ decodeFormat=XML decodeInput=buf xmlVuln='XXE' decodeOutput=ET.parse(..) SPURIOUS:getAPathArgument=buf # Spurious as this is used as a file-like object, not a path ensure_tainted( tree, # $ tainted tree.getroot().text, # $ tainted @@ -94,6 +95,7 @@ def test(): next(tree.iter()).text, # $ MISSING:tainted tree.iterfind("b"), # $ tainted next(tree.iterfind("b")).text, # $ MISSING:tainted + tree.xpath("b")[0].text, # $ tainted getXPath="b" ) (elem2, ids) = ET.XMLID(src) # $ decodeFormat=XML decodeInput=src xmlVuln='XXE' decodeOutput=ET.XMLID(..) From e6794a9af1ad4dd5a788a08600f674ffcb919c27 Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Wed, 11 Dec 2024 14:27:25 +0000 Subject: [PATCH 8/9] Add change note --- python/ql/lib/change-notes/2024-12-11-lxml-flowsteps.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 python/ql/lib/change-notes/2024-12-11-lxml-flowsteps.md diff --git a/python/ql/lib/change-notes/2024-12-11-lxml-flowsteps.md b/python/ql/lib/change-notes/2024-12-11-lxml-flowsteps.md new file mode 100644 index 000000000000..0ceaf914c361 --- /dev/null +++ b/python/ql/lib/change-notes/2024-12-11-lxml-flowsteps.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +- Additional taint steps through methods of `lxml.etree.Element` and `lxml.etree.ElementTree` objects from the `lxml` PyPI package have been modeled. \ No newline at end of file From dcbcf7e2bd4a25d951de65460b83035f97204141 Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Thu, 12 Dec 2024 15:55:36 +0000 Subject: [PATCH 9/9] Add additional tests demonstrating false negative flow --- .../library-tests/frameworks/lxml/taint_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/ql/test/library-tests/frameworks/lxml/taint_test.py b/python/ql/test/library-tests/frameworks/lxml/taint_test.py index d978d855d88d..437a41acbaf0 100644 --- a/python/ql/test/library-tests/frameworks/lxml/taint_test.py +++ b/python/ql/test/library-tests/frameworks/lxml/taint_test.py @@ -1,5 +1,6 @@ import lxml.etree as ET import io +import typing def ensure_tainted(*args): print("ensure_tainted: ", *args) @@ -133,6 +134,21 @@ def func(tree_arg: ET.ElementTree): ) func(tree2) + + def func2(x): + return x + + def func3(x) -> ET.Element: + return x + + ensure_tainted( + func2(tree), # $ tainted + func2(tree).text, # $ MISSING:tainted - type tracking not tracked through flow preserving calls + func3(tree).text, # $ MISSING:tainted - this includes if there is a type hint annotation on the return + typing.cast(ET.ElementTree, tree), # $ tainted + typing.cast(ET.ElementTree, tree).text, # $ MISSING:tainted - this includes for flow summary models + + ) test() \ No newline at end of file