diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..64bb1b0 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,23 @@ +language: python +python: + - "2.7" + +# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors +before_install: + - sudo apt-get update -qq + - sudo apt-get install -y python-lxml python-pip +install: + # here we don't use the travis virtualenv because that + # causes us to install lxml again below, doubling the build time + # when it compiles the native modules from source + - deactivate + - sudo pip install xml2rfc + +# command to run tests, e.g. python setup.py test +script: make all #upload + +env: + global: + - GH_REF: github.com/juberti/draughts.git + - secure: "Dyu6BRI5Gyidgnshtz4qNvDtXfGLhsoOH9KIyGpk+3RgDYpE2t0uL2D1oWAr7oNbgzHuBpEkd4HaSigs0Yu5UgmQXvhwjBO/ChrleX9g4lVx7qjkOGEz94o6B/FI/ygqmQ819V4CyldZkSYyAJSaL0/OanwuKZ6CejKpCyJYJeo=" + diff --git a/nombis/Makefile b/Makefile similarity index 89% rename from nombis/Makefile rename to Makefile index 2ecd315..8aa71e6 100644 --- a/nombis/Makefile +++ b/Makefile @@ -9,8 +9,9 @@ DIF := #$(patsubst %.xml,%.diff.html,$(SRC)) PDF := #$(patsubst %.xml,%.pdf,$(SRC)) SVG := #$(patsubst %.wsd,%.svg,$(WSRC)) -#all: $(HTML) $(TXT) $(DIF) $(PDF) -all: $(HTML) $(TXT) $(DIF) $(SVG) $(PDF) +all: $(HTML) $(TXT) +# $(DIF) $(PDF) +#all: $(HTML) $(TXT) $(DIF) $(SVG) $(PDF) clean: rm -f *~ draft*.html draft*pdf draft-*txt $(SVG) diff --git a/README.md b/README.md index fe5dfe0..0fbdd11 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Build Status](https://travis-ci.org/juberti/draughts.svg)](https://travis-ci.org/juberti/draughts) + draughts ======== diff --git a/fec/draft-uberti-rtcweb-fec.xml b/draft-ietf-rtcweb-fec.xml similarity index 65% rename from fec/draft-uberti-rtcweb-fec.xml rename to draft-ietf-rtcweb-fec.xml index 06cbcfa..308d8ea 100755 --- a/fec/draft-uberti-rtcweb-fec.xml +++ b/draft-ietf-rtcweb-fec.xml @@ -5,7 +5,7 @@ - + ]> @@ -19,7 +19,7 @@ - + 
WebRTC Forward Error Correction Requirements @@ -43,22 +43,22 @@ - + RAI - This document makes recommendations for how Forward Error Correction - (FEC) should be used by WebRTC applications. + This document provides information and requirements for how + Forward Error Correction (FEC) should be used by WebRTC applications.
- In situations where packet loss is high, or media quality must be perfect, + In situations where packet loss is high, or perfect media quality is essential, Forward Error Correction (FEC) can be used to proactively recover from packet losses. - This document describes what FEC mechanisms should be used by WebRTC - client implementations. + This specification provides guidance on which FEC mechanisms to use, and + how to use them, for WebRTC client implementations.
@@ -75,7 +75,8 @@ can be accomplished; this section enumerates the various mechanisms and describes their tradeoffs.
- This approach, as described in , Section 4.3, sends FEC packets as an + This approach, as described in , Section 4.3, + sends FEC packets as an independent SSRC-multiplexed stream, with its own SSRC and payload type. While by far the most flexible, each FEC packet will have its own IP+UDP+RTP+FEC header, leading to additional overhead of the FEC stream. @@ -83,14 +84,16 @@
This approach, as described in , allows for redundant data - to be piggybacked on an existing primary encoding in a single packet. + to be piggybacked on an existing primary encoding, all in a single packet. This redundant data may be an exact copy of a previous packet, or for codecs that support variable-bitrate encodings, possibly a smaller, - lower-quality representation. Since there is only a single set of - packet headers, this allows for a very efficient representation of + lower-quality representation. In certain cases, the redundant data + could include multiple prior packets. + Since there is only a single set of + packet headers, this approach allows for a very efficient representation of primary + redundant data. However, this savings is only realized when - the two encodings both fit into a single packet (i.e. less than a MTU). - This approach is also only applicable to audio content. + the data all fits into a single packet (i.e. the size is less than a MTU). + As a result, this approach is generally not useful for video content.
@@ -104,42 +107,37 @@
- The following section provides guidance on how to best use FEC for transmitting + The following section provides guidance on how to best use FEC for transmitting audio data. As indicated in below, FEC should only be activated if network conditions warrant it, or upon explicit application request.
When using the Opus codec in its default (hybrid) mode, use of - the built-in Opus FEC mechanism is RECOMMENDED. - This provides reasonable protection of the - audio stream against typical losses, with moderate overhead. [TODO: add stats] - Note though that this mechanism only protects the SILK layer of the Opus codec; - the CELT portion is not protected. This is not an issue when Opus is running in - hybrid mode, as the lower frequencies will still be able to be recovered, - with minimal quality impact. + the built-in Opus FEC mechanism is RECOMMENDED. This provides reasonable + protection of the audio stream against typical losses, with minimal + overhead. [TODO: add stats] - When using Opus in CELT mode, or other variable-bitrate codecs, - use of redundant encoding with a lower-fidelity version of the - previous packet is RECOMMENDED. When using Opus specifically, the lower-fidelity - version can simply be a truncated version of the previous Opus packet. - [TODO: decide exact truncated size] - This provides reasonable protection of the payload with minimal overhead. + When using variable-bitrate codecs without an internal FEC, use of + redundant encoding with a lower-fidelity version + of previous packet(s) is RECOMMENDED. + This provides reasonable protection of the payload with moderate overhead. - When using constant-bitrate codecs, e.g. PCMU, use of redundant encoding - is NOT RECOMMENDED, as this will result in a potentially significant bitrate increase. - Furthermore, suddenly increasing the bitrate to deal with packet losses may + When using constant-bitrate codecs, e.g. PCMU, use of + redundant encoding MAY be used, but note that + this will result in a potentially significant bitrate increase, and + that suddenly increasing bitrate to deal with losses from congestion may actually make things worse. 
- Because of the lower packet rate of audio encodings, usually a single packet per - frame, use of a separate FEC stream comes with a higher overhead than other - mechanisms, and therefore is NOT RECOMMENDED. + Because of the lower packet rate of audio encodings, usually a single + packet per frame, use of a separate FEC stream comes with a higher overhead + than other mechanisms, and therefore is NOT RECOMMENDED.
Support for redundant encoding can be indicated by offering "red" as a supported payload type in the offer. Answerers can reject the use of - redundant encoding by not including "red" as a supported payload type in + redundant encoding by not including "red" as a supported payload type in the answer. Support for codec-specific FEC mechanisms is typically indicated via "a=fmtp" parameters. For Opus specifically, this is controlled by the @@ -150,13 +148,13 @@
- The following section provides guidance on how to best use FEC for transmitting + The following section provides guidance on how to best use FEC for transmitting video data. As indicated in below, FEC should only be activated if network conditions warrant it, or upon explicit application request.
For video content, use of a separate FEC stream with the RTP payload format - described in is RECOMMENDED. + described in is RECOMMENDED. The receiver can demultiplex the incoming FEC stream by SSRC and correlate it with the primary stream via the ssrc-group mechanism. @@ -166,22 +164,36 @@
To offer support for a separate FEC stream, the offerer MUST offer one of - the formats described in , Section 5.1, as well as a + the formats described in , + Section 5.1, as well as a ssrc-group with "FEC-FR" semantics as described in , Section 4.3. Answerers can reject the use of FEC by not including FEC payloads in the answer. -
+
+
+ +
+ WebRTC also supports the ability to send generic application data, and + provides transport-level retransmission mechanisms that the application + can use to ensure that its data is delivered reliably. + + Because the application can control exactly what data to send, it has the + ability to monitor packet statistics and perform its own application-level + FEC, if necessary. + As a result, this document makes no recommendations regarding FEC for + the underlying data transport.
To support the functionality recommended above, implementations MUST support - the redundant encoding mechanism described in and the FEC - mechanism described in and . - Implementations MAY support additional FEC mechanisms if desired, + the redundant encoding mechanism described in and the FEC + mechanism described in and + . + Implementations MAY support additional FEC mechanisms if desired, e.g. .
- +
- Since use of FEC causes redundant data to be transmitted, this will + Since use of FEC causes redundant data to be transmitted, this will lead to less bandwidth available for the primary encoding, when in a bandwidth-constrained environment. Given this, WebRTC implementations SHOULD only transmit FEC data when network conditions indicate that @@ -192,7 +204,17 @@
- TODO + This document makes recommendations regarding the use of FEC. + Generally, it should be noted that although applying redundancy is often useful + in protecting a stream against packet loss, if the loss is caused by network + congestion, the additional bandwidth used by the redundant data may actually + make the situation worse, and can lead to significant degradation of the + network. + + Additional security considerations for each individual FEC mechanism + are enumerated in their respective documents. + +
@@ -200,7 +222,8 @@
- Several people provided significant input into this document, including Jonathan Lennox, Giri Mandyam, Varun Singh, Tim Terriberry, and Mo Zanaty. + Several people provided significant input into this document, including + Jonathan Lennox, Giri Mandyam, Varun Singh, Tim Terriberry, and Mo Zanaty.
@@ -219,6 +242,15 @@
+ Changes in draft -01: + Tweaked abstract/intro text that was ambiguously normative. + Removed text on FEC for Opus in CELT mode. + Changed RFC 2198 recommendation for PCMU to be MAY instead of + NOT RECOMMENDED, based on list feedback. + Explicitly called out application data as something not addressed + in this document. + Updated flexible-fec reference. + Changes in draft -00: Initial version, from sidebar conversation at IETF 90. diff --git a/draft-uberti-behave-turn-rest.xml b/draft-uberti-behave-turn-rest.xml new file mode 100755 index 0000000..d99a2f6 --- /dev/null +++ b/draft-uberti-behave-turn-rest.xml @@ -0,0 +1,246 @@ + + + + + + + + + + + + + + + A REST API For Access To TURN Services + + + Google +
+ + 747 6th St S + Kirkland + WA + 98033 + USA + + justin@uberti.name +
+
+ + + +This document describes a proposed standard REST API for obtaining access to TURN services via ephemeral (i.e. time-limited) credentials. These credentials are vended by a web service over HTTP, and then supplied to and checked by a TURN server using the standard TURN protocol. The usage of ephemeral credentials ensures that access to the TURN server can be controlled even if the credentials can be discovered by the user, as is the case in WebRTC where TURN credentials must be specified in Javascript. + + +
+ +
+
+ + TURN is a protocol that is often used to improve the connectivity of P2P applications. By providing a cloud-based relay service, TURN ensures that a connection can be established even when one or both sides is incapable of a direct P2P connection. However, as a relay service, it imposes a nontrivial cost on the service provider. Therefore, access to a TURN service is almost always access-controlled. + + + TURN provides a mechanism to control access via long-term credentials that are provided to the server as part of the TURN protocol. However, use of traditional login usernames and passwords for these credentials poses several problems, many of which are outlined in , Section 4. The list below summarizes the major issues: + + The TURN password must be kept secret, which is difficult or impossible for web applications. + The TURN password is vulnerable to offline dictionary attacks against MESSAGE-INTEGRITY. + The TURN server must consult a password database to verify MESSAGE-INTEGRITY, increasing the complexity of the TURN server. + The TURN username value is passed in the clear, and can be used to determine who is talking to whom. + + +
+
+ + To address these problems, this document proposes an API that can be used to retrieve ephemeral TURN credentials from a web service. These credentials can then be used as long-term credentials with a standard TURN server, requiring only a custom authentication module. For simplicity, the design has been kept intentionally stateless; the only interaction needed between the web service and the TURN service is to share a secret key. Because of this, the authentication check can be done entirely on the TURN server without needing to consult any external service. + + This design directly addresses the issues mentioned above: + + The TURN password is ephemeral (i.e. time-limited), so even if it is discovered, its usefulness is limited. + The TURN password is machine-generated with high entropy, making it resistant to dictionary attacks. + The TURN server can validate the password trivially, using a pre-configured secret key. + The TURN username value does not need to contain any externally-identifying information. + + + + Note that while also supports the concept of short-term credentials, they are not useful for TURN. Since a nonce is not used with short-term credentials, they are completely vulnerable to replay attacks. + +
+
+
+ +To retrieve a new set of credentials, the client makes a HTTP GET request, specifying TURN as the service to allocate credentials for, and optionally specifying a user id parameter. The purpose of the user id parameter is to simplify debugging on the TURN server, as well as provide the ability to control the number of credentials handed out for a specific user, if desired. The TURN credentials and their lifetime are returned as JSON, along with URIs that indicate how to connect to the server using the TURN protocol. + + +To avoid the need for state passing between the web service and TURN server, the returned credentials consist of a TURN username that encodes all the necessary state (expiry time and application user id), and a TURN password that is a digest of this state, signed with the shared secret key. + + +Since the returned credentials are ephemeral, they will eventually expire. This does not affect existing TURN allocations, as they are tied to a specific 5-tuple, but requests to allocate new TURN ports will fail after the expiry time. This is significant in the case of an ICE restart, where the client will need to allocate a new set of candidates, including TURN candidates. To get a new set of ephemeral credentials, the client can simply re-issue the original HTTP request with the same parameters, which will return the new credentials in its JSON response. + + +To prevent unauthorized use, the HTTP requests can be ACLed by various means, e.g. IP address (if coming from a server), Origin header, User-Agent header, login cookie, API key, etc. + +
+ + The request includes the following parameters, specified in the URL: + +service: specifies the desired service (turn) +username: an optional user id to be associated with the credentials +key: if an API key is used for authentication, the API key + + + +Example: +
+ +GET /?service=turn&username=mbzrxpgjys + +
+
+
+
+ +The response is returned with content-type "application/json", and consists of a JSON object with the following parameters: + +username: the TURN username to use, which is a colon-delimited combination of the expiration timestamp and the username parameter from the request (if specified). The timestamp is intended to be opaque to the web application, so its format is arbitrary, but for simplicity, use of UNIX timestamps is recommended. +password: the TURN password to use; this value is computed from a secret key shared with the TURN server and the returned username value, by performing base64(hmac(secret key, returned username)). HMAC-SHA1 is one HMAC algorithm that can be used, but any algorithm that incorporates a shared secret is acceptable, as long as both the web server and TURN server use the same algorithm and secret. +ttl: the duration for which the username and password are valid, in seconds. A value of one day (86400 seconds) is recommended. +uris: an array of TURN URIs, in the form specified in . This is used to indicate the different addresses and/or protocols that can be used to reach the TURN server. These URIs SHOULD specify a hostname, IPv4, or IPv6 address for the TURN server, as well as the port and transport to use; this avoids the need for a DNS SRV or S-NAPTR lookup as specified in . + + + +Example: +
+ + { + "username" : "12334939:mbzrxpgjys", + "password" : "adfsaflsjfldssia", + "ttl" : 86400, + "uris" : [ + "turn:1.2.3.4:9991?transport=udp", + "turn:1.2.3.4:9992?transport=tcp", + "turns:1.2.3.4:443?transport=tcp" + ] +} + +
+
+
+
+
+ +The returned JSON is parsed into an RTCIceServer object, and supplied as part of the RTCConfiguration object that is used when creating a RTCPeerConnection. + + +Example: +
+ +var iceServer = { + "username": response.username, + "credential": response.password, + "uris": response.uris +}; +var config = {"iceServers": [iceServer]}; +var pc = new RTCPeerConnection(config); + +
+
+ +When the credentials are updated (e.g. because they are about to expire), a new RTCConfiguration with the updated credentials can be supplied to the existing RTCPeerConnection via the updateIce method. This update must not affect existing TURN allocations, because TURN requires that the username stay constant for an allocation, but the new credentials will be used for any new allocations. + + +[TODO: make sure this behavior is specified in the W3C API spec] + +
+
+
+ +The WebRTC client will perform a standard TURN allocation sequence using the long-term credentials mechanism specified in , Section 10.2, using the "username" value from the returned JSON for its USERNAME attribute, and the "password" value for the password input to the MESSAGE-INTEGRITY hash. + +
+
+The TURN server will process the request using the long-term credentials mechanism specified in . Note that the REALM value supplied by the server is not meaningful in this context, and can be set to any valid value. + + When processing ALLOCATE requests, the TURN server MUST split the USERNAME attribute into its timestamp and user id components, and verify that the timestamp, which indicates when the credentials expire, has not yet been reached. If this verification fails, it SHOULD reject the request with a 401 (Unauthorized) error. + + +If desired, the TURN server can optionally verify that the parsed user id value corresponds to a currently valid user of an external service (e.g. is currently logged in to the web app that is making use of TURN). This requires proprietary communication between the TURN server and external service on each ALLOCATE request, and is not necessary for typical applications. If this external verification fails, it SHOULD reject the request with a 401 (Unauthorized) error. + + +For non-ALLOCATE requests, the TURN server merely verifies that the USERNAME matches the USERNAME that was used in the ALLOCATE (since it must remain constant). + + +As in RFC 5766, the TURN server MUST verify the MESSAGE-INTEGRITY using the password associated with the supplied USERNAME. For the usage outlined in this document, the password will always be constructed using the supplied username and the shared secret as indicated in the "HTTP Interactions" section above. Because the password is derived from the USERNAME, successful verification of the MESSAGE-INTEGRITY ensures that the USERNAME (and the expiration time contained within) is trustworthy. + +
+
+
+ + The diagram below shows the complete set of interactions between a client application, the TURN REST API, and the actual TURN server. + +
+ + | | + |<---200 OK, JSON body------| | + | username: | | + | password: | | + | uris:["example.com:3478?transport=udp"] | + | | | + |----ALLOCATE Request------------------------------>| + |<---ALLOCATE Error Response------------------------| + | NONCE= | | + | REALM="" | | + | ERROR_CODE=401 | | + |----ALLOCATE Request------------------------------>| + | USERNAME= | | + | NONCE= | | + | REALM="" | | + | MESSAGE_INTEGRITY= | + |<---ALLOCATE Response------------------------------| + ]]> + +
+
+
+
+ + In the system as described here, revoking specific credentials is not possible. The assumption is that TURN services are of low enough value that waiting for the timeout to expire is a valid approach for dealing with possibly-compromised credentials. + + +In extreme abuse cases, TURN server blacklists of timestamp+username values can be supplied by an administrator to stop abuse of specific credential sets. + +
+
+ +As indicated in , periodic rotation of the shared secret to protect against key compromise is RECOMMENDED. To facilitate the rollover, the TURN server SHOULD be able to validate incoming MESSAGE-INTEGRITY tokens based on at least 2 shared secrets at any time. + +
+
+
+ Because the USERNAME values in a TURN ALLOCATE request are typically visible to eavesdroppers, inclusion of an externally identifying user id, such as a login name, may allow a passive attacker to determine the identities of the parties in a conversation. To prevent this problem, use of opaque user id values is recommended. + This mechanism assumes that the clocks of the web server and TURN server are roughly in sync. Given the expected large TTLs for the vended credentials, clock skew on the order of seconds to minutes should not cause an issue. However, if the TURN server's clock was mistakenly set to a date significantly in the past, credentials could be accepted for far longer than their intended lifetime. +
+
+ None. +
+
+ Harald Alvestrand, Alfred Godoy, and Philipp Hancke provided key input on the initial design. Dave Cridland, Cullen Jennings, Oleg Moskalenko, and Matthew Robertson pointed out several errors and omissions. +
+
+ + + + + + + + + + + + + +
diff --git a/nombis/draft-uberti-mmusic-nombis.xml b/draft-uberti-mmusic-nombis.xml similarity index 71% rename from nombis/draft-uberti-mmusic-nombis.xml rename to draft-uberti-mmusic-nombis.xml index 66690ce..28849cf 100755 --- a/nombis/draft-uberti-mmusic-nombis.xml +++ b/draft-uberti-mmusic-nombis.xml @@ -6,6 +6,7 @@ + ]> @@ -63,7 +64,7 @@ - + RAI @@ -78,7 +79,7 @@ Interactive Connectivity Establishment (ICE) attempts to find the 'best' path for connectivity between two peers; in ICE parlance, these paths are known as 'candidate pairs'. During the ICE process, one endpoint, - known as the 'controlling' endpoint, selects a candidate pair as the + known as the 'controlling' endpoint, selects a candidate pair as the best pair; this action is known as nomination. ICE supports two different mechanisms for performing nomination, known as Regular Nomination, and Aggressive Nomination. @@ -91,14 +92,14 @@ Needless to say, the presence of both modes also adds nontrivial complexity. Lastly, ICE is currently defined as a finite process, where the decision on the optimal candidate pair is made during call setup and infrequently (if ever) - changed. While this may be acceptable for endpoints with static network + changed. While this may be acceptable for endpoints with static network configurations, it fails to meet the needs of mobile endpoints, who may need to seamlessly move between networks, or be connected to multiple networks simultaneously. In these cases, the controlling endpoint may want to maintain - multiple potential candidate pairs, and make dynamic decisions to switch + multiple potential candidate pairs, and make dynamic decisions to switch between them as conditions change. 
- To address these challenges, this document makes two proposals for - refactoring ICE nomination - merging Regular and Aggressive Nomination, and + To address these challenges, this document makes two proposals for + refactoring ICE nomination - merging Regular and Aggressive Nomination, and introducing a new mode, known as Continuous Nomination. This makes ICE substantially more flexible without increasing complexity. @@ -125,7 +126,7 @@
While an ICE endpoint will assign various priority values to its ICE candidates, - these priorities are static and can only be based on a priori knowledge; the + these priorities are static and can only be based on a priori knowledge; the shortcomings of this approach are discussed in the first paragraph of Section 2.6 in . To properly make choices in multi-network and multi-server scenarios, the controlling @@ -134,7 +135,7 @@ which TURN servers to use, as described in To ensure symmetric flows, this implies that the controlling endpoint MUST be able to communicate its choice to the controlled - side. + side.
@@ -144,16 +145,16 @@ warm and switch immediately if connectivity is interrupted on one of them. As the signaling channel may be affected by the event necessitating the switch, this implies that the controlling endpoint MUST be able to change the selected pair - and indicate this to the remote side without signaling. The need for this + and indicate this to the remote side without signaling. The need for this functionality has been stated in and . The rules in ensure that the controlled endpoint keeps its candidate needed for the selected pair alive. However, - in order for alternate pairs to remain available, the controlled endpoint + in order for alternate pairs to remain available, the controlled endpoint must keep the associated candidates alive as well, following the procedures outlined in , Section 4.1.1.4. This implies - that the controlling endpoint MUST have some way to + that the controlling endpoint MUST have some way to indicate to the controlled side that specific candidates are to be kept alive.
@@ -165,7 +166,7 @@ remote side. While this could be done using an ICE restart, as described in , Section 9.1, the ICE restart may have unintended consequences, such as causing the remote side to regather all candidates. - Instead, it would be best if the new candidates could be trickled, as + Instead, it would be best if the new candidates could be trickled, as discussed in , but even after ICE processing has completed. @@ -180,7 +181,7 @@
Increased functionality typically leads to increased complexity, which leads to more edge cases, and more implementation bugs. This suggests that - in addition to proposing new ICE functionality, the ideal solution SHOULD + in addition to proposing new ICE functionality, the ideal solution SHOULD deprecate superfluous functionality.
@@ -200,7 +201,7 @@ as in Aggressive Nomination, the requirements of and can be met, while meeting the compatibility requirement from and, since - Aggressive Nomination is no longer needed, the complexity requirement from + Aggressive Nomination is no longer needed, the complexity requirement from . @@ -237,12 +238,60 @@ nomination completes. [N.B. this could be the ice-options:continuous option described below] +
+ An example call setup using Regular Nomination as described above + is shown here. Alice is in the controlling role, and Bob is in the + controlled role; Alice has a single host candidate and Bob has + both host and relay candidates. + Alice's initial check to Bob's + host candidate fails, but the check to his relay candidate succeeds, + so Alice starts transmitting media on her host-relay pair. Bob's + initial check from his host candidate to Alice's host candidate + succeeds, so he starts transmitting media over this host-host pair to + Alice. However, when Alice's host check is later retransmitted, + it succeeds, and Alice determines that the host-host pair has a better + RTT than the host-relay pair, so she cuts media over to use the + host-host pair. Eventually, Alice concludes Regular Nomination by + sending a final check to Bob with the USE-CANDIDATE flag set. If Bob + had selected a different pair to use than Alice, this action would + have forced Bob to use the same pair. + +
 | + |(2) STUN Res (Bob host) | | + | Lost|<----------------------------| + |(3) STUN Req (Bob relay) | | + |---------------------------------------------------------->| + |(4) STUN Res (Bob relay) | | + |<----------------------------------------------------------| + |(5) RTP starts (Bob relay) | | + |==========================================================>| + |(6) STUN Req (Alice host) | | + |<----------------------------------------------------------| + |(7) STUN Res (Alice host) | | + |---------------------------------------------------------->| + |(8) RTP starts (Alice host) | | + |<==========================================================| + |(9) STUN Req (Bob host) | | + |---------------------------------------------------------->| + |(10) STUN Res (Bob host) | | + |<----------------------------------------------------------| + |(11) RTP switch (Bob host) | | + |==========================================================>| + |(12) STUN Req (Bob host, U-C)| | + |---------------------------------------------------------->| + |(13) STUN Res (Bob host) | | + |<----------------------------------------------------------| +]]>
+
As discussed above, in mobile environments there can be multiple possible - valid candidate pairs, and these can change at various points in the call, + valid candidate pairs, and these can change at various points in the call, as new interfaces go up and down, signal strength for wireless interfaces changes, and new relay servers are discovered. However, under 5245 rules, once a candidate pair is selected and confirmed, via @@ -258,16 +307,16 @@ may be impacted by the handover.
Under continuous nomination, ICE never concludes; new candidates can - always be trickled, and a new candidate pair can be selected by the + always be trickled, and a new candidate pair can be selected by the controlling side at any time. When selecting a new candidate pair, the controlling side informs the - controlled side of the chosen pat by sending a new Binding Request + controlled side of the chosen path by sending a new Binding Request with a USE-CANDIDATE attribute. The decision about which candidate pair to use is fully dynamic; the controlling side can use metrics such as RTT or loss rate to change the selected pair at any time. If Binding Requests need to be sent for any other reason, such as consent checks - [TODO: reference], any checks sent on the selected pair MUST include a USE-CANDIDATE - attribute. + , any checks sent on + the selected pair MUST include a USE-CANDIDATE attribute. Upon receipt of a Binding Request with USE-CANDIDATE, the controlled side MUST switch its media path to the candidate pair on which the Binding Request @@ -275,9 +324,11 @@ may be impacted by the handover. During continuous nomination, the controlling side may still elect to prune certain candidate pairs; for example, an implementation may choose to drop relay candidates once a successful connection has been - established. The controlled side, however, should follow the + established. The controlled side, however, should follow the controlling side's lead in terms of deciding whether any pairs should - be pruned. The controlling ICE Agent informs the remote side of its + be pruned. [TODO: should the controlled side have any say in the matter, + e.g. to eliminate certain candidates?] + The controlling ICE Agent informs the remote side of its preferences by continuing to send Binding Requests to the remote side on each candidate pair that it wants to retain. The controlled ICE Agent SHOULD prune any @@ -287,11 +338,12 @@ may be impacted by the handover. 
approach is correct, or if we should have some sort of approach similar to TURN LIFETIME indicating when a pair should be GCed, with LIFETIME==0 indicating immediate GC.] One side benefit of doing this - is that the continuous exchange of Binding Requests across all + is that the continuous exchange of Binding Requests across all candidate pairs allows the RTT and loss rate for each to be reliably determined and kept up to date. If the endpoints have negotiated Trickle ICE support - [TODO: reference], and new candidates become available on either side, + , and new candidates + become available on either side, the endpoint may send these candidates to the remote side using the existing Trickle ICE mechanisms. Once all of the new candidates have been transmitted, the endpoint MUST send an end-of-candidates @@ -302,18 +354,62 @@ may be impacted by the handover. continuous nomination sequence, and upon successful completion, discarding all candidates from the previous nomination sequence. -
+
Since standard ICE implementations may not expect the selected pair to change after a USE-CANDIDATE attribute is received, support for continuous nomination is explicitly indicated via a new "continuous" value for ice-options. If the remote side does not support the -"continuous" option, the controlling side MUST fall back to Regular Nomination, as +"continuous" option, the controlling side MUST fall back to Regular Nomination, as specified in , Section 8.1.1.
- - -
- TODO +
+
+ Alice and Bob have set up a call using ICE and have established + multiple valid pairs. The currently selected pair is for a + peer-to-peer route, as it had the highest initial priority value. + However, they have also kept alive an alternate valid pair that goes + through their TURN servers. At a certain point, Alice detects, via + the connectivity checks that she continues to do on the relayed pair, + that it actually has a better RTT than the peer-to-peer path. She + then decides to switch media over to this path. + As mentioned above, this is easily handled by Alice immediately + switching her media to the relayed path; future STUN checks on this + path also include the USE-CANDIDATE attribute. +
+
+ Alice and Bob have set up a call using ICE, and are currently + sending their media through Alice's TURN server. At a certain point, + Alice's application discovers a new TURN server that it thinks + might provide a better path for this call. + Alice gathers new candidates from this TURN server, and trickles + them to Bob. They perform connectivity checks using these candidates, + and Alice determines that the RTT when going through this TURN server is + better than the RTT of the current relayed path. + As in the previous example, this is easily handled by Alice switching + media to the new path, along with sending USE-CANDIDATE. If the old + path is no longer needed, Alice can destroy the allocation on the + old TURN server, and Bob will stop checking it when it stops working. + +
+
+ Alice and Bob have set up a call using ICE, and are currently + exchanging their media directly via a peer-to-peer path. Alice is + on a mobile device, with both wifi and cellular interfaces, but for + power reasons, candidates have only been gathered on the wifi + interface. At a certain point, Alice leaves her home while the call + is active. + In response to the decreasing wifi signal strength, Alice starts + to collect candidates on the cellular interface, and trickles them + to Bob. They perform connectivity checks using these candidates, + and, because of the low wifi signal strength, these candidates are + preferred over the existing selected pair. + As in the previous examples, Alice can easily switch media to the + new selected pair. When Alice walks completely out of wifi range, + and the wifi interface goes down, the wifi candidates are pruned, + and any valid pairs on Bob's side that use those candidates will + time out and be pruned as well. +
+
@@ -322,13 +418,14 @@ indicated via a new "continuous" value for ice-options. If the remote side does
A new ICE option "continuous" has been [will be] registered - in the "ICE Options" registry created by . + in the "ICE Options" registry created by .
Several people provided significant input into this document, - including Martin Thomson, Brandon Williams, and Dan Wing. + including Martin Thomson, Brandon Williams, and Dan Wing. Emil Ivov also + provided several of the examples for continuous nomination.
@@ -340,6 +437,7 @@ indicated via a new "continuous" value for ice-options. If the remote side does + &consent; &trickle; &redirect; &icemob; diff --git a/fec/draft-uberti-rtcweb-fec.txt b/fec/draft-uberti-rtcweb-fec.txt deleted file mode 100644 index 8de9916..0000000 --- a/fec/draft-uberti-rtcweb-fec.txt +++ /dev/null @@ -1,392 +0,0 @@ - - - - -Network Working Group J. Uberti -Internet-Draft Google -Intended status: Standards Track October 27, 2014 -Expires: April 30, 2015 - - - WebRTC Forward Error Correction Requirements - draft-uberti-rtcweb-fec-00 - -Abstract - - This document makes recommendations for how Forward Error Correction - (FEC) should be used by WebRTC applications. - -Status of This Memo - - This Internet-Draft is submitted in full conformance with the - provisions of BCP 78 and BCP 79. - - Internet-Drafts are working documents of the Internet Engineering - Task Force (IETF). Note that other groups may also distribute - working documents as Internet-Drafts. The list of current Internet- - Drafts is at http://datatracker.ietf.org/drafts/current/. - - Internet-Drafts are draft documents valid for a maximum of six months - and may be updated, replaced, or obsoleted by other documents at any - time. It is inappropriate to use Internet-Drafts as reference - material or to cite them other than as "work in progress." - - This Internet-Draft will expire on April 30, 2015. - -Copyright Notice - - Copyright (c) 2014 IETF Trust and the persons identified as the - document authors. All rights reserved. - - This document is subject to BCP 78 and the IETF Trust's Legal - Provisions Relating to IETF Documents - (http://trustee.ietf.org/license-info) in effect on the date of - publication of this document. Please review these documents - carefully, as they describe your rights and restrictions with respect - to this document. 
Code Components extracted from this document must - include Simplified BSD License text as described in Section 4.e of - the Trust Legal Provisions and are provided without warranty as - described in the Simplified BSD License. - - - - - - -Uberti Expires April 30, 2015 [Page 1] - -Internet-Draft WebRTC FEC October 2014 - - -Table of Contents - - 1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . 2 - 2. Terminology . . . . . . . . . . . . . . . . . . . . . . . . . 2 - 3. Types of FEC . . . . . . . . . . . . . . . . . . . . . . . . 2 - 3.1. Separate FEC Stream . . . . . . . . . . . . . . . . . . . 3 - 3.2. Redundant Encoding . . . . . . . . . . . . . . . . . . . 3 - 3.3. Codec-Specific In-band FEC . . . . . . . . . . . . . . . 3 - 4. FEC for Audio Content . . . . . . . . . . . . . . . . . . . . 3 - 4.1. Recommended Mechanism . . . . . . . . . . . . . . . . . . 3 - 4.2. Negotiating Support . . . . . . . . . . . . . . . . . . . 4 - 5. FEC for Video Content . . . . . . . . . . . . . . . . . . . . 4 - 5.1. Recommended Mechanism . . . . . . . . . . . . . . . . . . 4 - 5.2. Negotiating Support . . . . . . . . . . . . . . . . . . . 5 - 6. Implementation Requirements . . . . . . . . . . . . . . . . . 5 - 7. Adaptive Use of FEC . . . . . . . . . . . . . . . . . . . . . 5 - 8. Security Considerations . . . . . . . . . . . . . . . . . . . 5 - 9. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 5 - 10. Acknowledgements . . . . . . . . . . . . . . . . . . . . . . 5 - 11. References . . . . . . . . . . . . . . . . . . . . . . . . . 6 - 11.1. Normative References . . . . . . . . . . . . . . . . . . 6 - 11.2. Informative References . . . . . . . . . . . . . . . . . 6 - Appendix A. Change log . . . . . . . . . . . . . . . . . . . . . 6 - Author's Address . . . . . . . . . . . . . . . . . . . . . . . . 6 - -1. 
Introduction - - In situations where packet loss is high, or media quality must be - perfect, Forward Error Correction (FEC) can be used to proactively - recover from packet losses. This document describes what FEC - mechanisms should be used by WebRTC client implementations. - -2. Terminology - - The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", - "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this - document are to be interpreted as described in [RFC2119]. - -3. Types of FEC - - By its name, FEC describes the sending of redundant information in an - outgoing packet stream so that information can still be recovered - even in the face of packet loss. There are multiple ways in which - this can be accomplished; this section enumerates the various - mechanisms and describes their tradeoffs. - - - - - - -Uberti Expires April 30, 2015 [Page 2] - -Internet-Draft WebRTC FEC October 2014 - - -3.1. Separate FEC Stream - - This approach, as described in [RFC5956], Section 4.3, sends FEC - packets as an independent SSRC-multiplexed stream, with its own SSRC - and payload type. While by far the most flexible, each FEC packet - will have its own IP+UDP+RTP+FEC header, leading to additional - overhead of the FEC stream. - -3.2. Redundant Encoding - - This approach, as descibed in [RFC2198], allows for redundant data to - be piggybacked on an existing primary encoding in a single packet. - This redundant data may be an exact copy of a previous packet, or for - codecs that support variable-bitrate encodings, possibly a smaller, - lower-quality representation. Since there is only a single set of - packet headers, this allows for a very efficient representation of - primary + redundant data. However, this savings is only realized - when the two encodings both fit into a single packet (i.e. less than - a MTU). This approach is also only applicable to audio content. - -3.3. 
Codec-Specific In-band FEC - - Some audio codecs, notably Opus [RFC6716], support their own in-band - FEC mechanism, where FEC data is included in the codec payload. In - the case of Opus specifically, packets deemed as important are re- - encoded at a lower bitrate and added to the subsequent packet, - allowing partial recovery of a lost packet. See [RFC6716], - Section 2.1.7 for details. - -4. FEC for Audio Content - - The following section provides guidance on how to best use FEC for - transmitting audio data. As indicated in Section 7 below, FEC should - only be activated if network conditions warrant it, or upon explicit - application request. - -4.1. Recommended Mechanism - - When using the Opus codec in its default (hybrid) mode, use of the - built-in Opus FEC mechanism is RECOMMENDED. This provides reasonable - protection of the audio stream against typical losses, with moderate - overhead. [TODO: add stats] Note though that this mechanism only - protects the SILK layer of the Opus codec; the CELT portion is not - protected. This is not an issue when Opus is running in hybrid mode, - as the lower frequencies will still be able to be recovered, with - minimal quality impact. - - - - - -Uberti Expires April 30, 2015 [Page 3] - -Internet-Draft WebRTC FEC October 2014 - - - When using Opus in CELT mode, or other variable-bitrate codecs, use - of [RFC2198] redundant encoding with a lower-fidelity version of the - previous packet is RECOMMENDED. When using Opus specifically, the - lower-fidelity version can simply be a truncated version of the - previous Opus packet. [TODO: decide exact truncated size] This - provides reasonable protection of the payload with minimal overhead. - - When using constant-bitrate codecs, e.g. PCMU, use of [RFC2198] - redundant encoding is NOT RECOMMENDED, as this will result in a - potentially significant bitrate increase. Furthermore, suddenly - increasing the bitrate to deal with packet losses may actually make - things worse. 
- - Because of the lower packet rate of audio encodings, usually a single - packet per frame, use of a separate FEC stream comes with a higher - overhead than other mechanisms, and therefore is NOT RECOMMENDED. - -4.2. Negotiating Support - - Support for redundant encoding can be indicated by offering "red" as - a supported payload type in the offer. Answerers can reject the use - of redundant encoding by not including "red" as a supported payload - type in the answer. - - Support for codec-specific FEC mechanisms are typically indicated via - "a=fmtp" parameters. For Opus specifically, this is controlled by - the "useinbandfec=1" parameter, as specified in - [I-D.ietf-payload-rtp-opus]. These parameters are declarative and - can be negotiated separately for either media direction. - -5. FEC for Video Content - - The following section provides guidance on how to best use FEC for - transmitting video data. As indicated in Section 7 below, FEC should - only be activated if network conditions warrant it, or upon explicit - application request. - -5.1. Recommended Mechanism - - For video content, use of a separate FEC stream with the RTP payload - format described in [I-D.singh-payload-rtp-1d2d-parity-scheme] is - RECOMMENDED. The receiver can demultiplex the incoming FEC stream by - SSRC and correlate it with the primary stream via the ssrc-group - mechanism. - - Note that this only allows the FEC stream to protect a single primary - stream. Support for protecting multiple primary streams with a - - - - -Uberti Expires April 30, 2015 [Page 4] - -Internet-Draft WebRTC FEC October 2014 - - - single FEC stream is complicated by WebRTC's 1-m-line-per-stream - policy and requires further study. - -5.2. Negotiating Support - - To offer support for a separate FEC stream, the offerer MUST offer - one of the formats described in - [I-D.singh-payload-rtp-1d2d-parity-scheme], Section 5.1, as well as a - ssrc-group with "FEC-FR" semantics as described in [RFC5956], - Section 4.3. 
- - Answerers can reject the use of FEC by not including FEC payloads in - the answer. - -6. Implementation Requirements - - To support the functionality recommended above, implementations MUST - support the redundant encoding mechanism described in [RFC2198] and - the FEC mechanism described in [RFC5956] and - [I-D.singh-payload-rtp-1d2d-parity-scheme]. - - Implementations MAY support additional FEC mechanisms if desired, - e.g. [RFC5109]. - -7. Adaptive Use of FEC - - Since use of FEC causes redundant data to be transmitted, this will - lead to less bandwidth available for the primary encoding, when in a - bandwidth-constrained environment. Given this, WebRTC - implementations SHOULD only transmit FEC data when network conditions - indicate that this is advisable (e.g. by monitoring transmit packet - loss data from RTCP Receiver Reports), or the application indicates - it is willing to pay a quality penalty to proactively avoid losses. - -8. Security Considerations - - TODO - -9. IANA Considerations - - This document requires no actions from IANA. - -10. Acknowledgements - - Several people provided significant input into this document, - including Jonathan Lennox, Giri Mandyam, Varun Singh, Tim Terriberry, - and Mo Zanaty. - - - - -Uberti Expires April 30, 2015 [Page 5] - -Internet-Draft WebRTC FEC October 2014 - - -11. References - -11.1. Normative References - - [I-D.singh-payload-rtp-1d2d-parity-scheme] - Singh, V., Begen, A., and M. Zanaty, "RTP Payload Format - for Non-Interleaved and Interleaved Parity Forward Error - Correction (FEC)", draft-singh-payload-rtp-1d2d-parity- - scheme-00 (work in progress), October 2014. - - [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate - Requirement Levels", BCP 14, RFC 2119, March 1997. - - [RFC2198] Perkins, C., Kouvelas, I., Hodson, O., Hardman, V., - Handley, M., Bolot, J., Vega-Garcia, A., and S. Fosse- - Parisis, "RTP Payload for Redundant Audio Data", RFC 2198, - September 1997. 
- - [RFC5956] Begen, A., "Forward Error Correction Grouping Semantics in - the Session Description Protocol", RFC 5956, September - 2010. - -11.2. Informative References - - [I-D.ietf-payload-rtp-opus] - Spittka, J., Vos, K., and J. Valin, "RTP Payload Format - for Opus Speech and Audio Codec", draft-ietf-payload-rtp- - opus-03 (work in progress), July 2014. - - [RFC5109] Li, A., "RTP Payload Format for Generic Forward Error - Correction", RFC 5109, December 2007. - - [RFC6716] Valin, JM., Vos, K., and T. Terriberry, "Definition of the - Opus Audio Codec", RFC 6716, September 2012. - -Appendix A. Change log - - Changes in draft -00: - - o Initial version, from sidebar conversation at IETF 90. - -Author's Address - - - - - - - - - -Uberti Expires April 30, 2015 [Page 6] - -Internet-Draft WebRTC FEC October 2014 - - - Justin Uberti - Google - 747 6th Ave S - Kirkland, WA 98033 - USA - - Email: justin@uberti.name - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Uberti Expires April 30, 2015 [Page 7] diff --git a/fec/Makefile b/lrr/Makefile similarity index 100% rename from fec/Makefile rename to lrr/Makefile diff --git a/lrr/draft-ietf-avtext-lrr-00.html b/lrr/draft-ietf-avtext-lrr-00.html new file mode 100644 index 0000000..d385cf2 --- /dev/null +++ b/lrr/draft-ietf-avtext-lrr-00.html @@ -0,0 +1,829 @@ + + + + + + + The Layer Refresh Request (LRR) RTCP Feedback Message + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Payload Working GroupJ. Lennox
Internet-DraftD. Hong
Intended status: Standards TrackVidyo
Expires: February 11, 2016J. Uberti
S. Holmer
M. Flodman
Google
August 12, 2015
+ +

The Layer Refresh Request (LRR) RTCP Feedback Message
+ draft-ietf-avtext-lrr-00

+ +

Abstract

+

This memo describes the RTP Payload-Specific Feedback Message "Layer Refresh Request" (LRR), which can be used to request a state refresh of one or more substreams of a layered media stream. It also defines its use with several scalable media formats.

+

Status of this Memo

+

This Internet-Draft is submitted in full conformance with the provisions of BCP 78 and BCP 79.

+

Internet-Drafts are working documents of the Internet Engineering Task Force (IETF). Note that other groups may also distribute working documents as Internet-Drafts. The list of current Internet-Drafts is at http://datatracker.ietf.org/drafts/current/.

+

Internet-Drafts are draft documents valid for a maximum of six months and may be updated, replaced, or obsoleted by other documents at any time. It is inappropriate to use Internet-Drafts as reference material or to cite them other than as "work in progress."

+

This Internet-Draft will expire on February 11, 2016.

+

Copyright Notice

+

Copyright (c) 2015 IETF Trust and the persons identified as the document authors. All rights reserved.

+

This document is subject to BCP 78 and the IETF Trust's Legal Provisions Relating to IETF Documents (http://trustee.ietf.org/license-info) in effect on the date of publication of this document. Please review these documents carefully, as they describe your rights and restrictions with respect to this document. Code Components extracted from this document must include Simplified BSD License text as described in Section 4.e of the Trust Legal Provisions and are provided without warranty as described in the Simplified BSD License.

+ + +
+

Table of Contents

+ + +

+1. Introduction +

+

This memo describes an RTP Payload-Specific Feedback Message [RFC4585] "Layer Refresh Request" (LRR). It is designed to allow a receiver of a layered media stream to request that one or more of its substreams be refreshed, such that it can then be decoded by an endpoint which previously was not receiving those layers, without requiring that the entire stream be refreshed (as it would be if the receiver sent a Full Intra Request (FIR) [RFC5104]).

+

The message is designed to be applicable both to temporally and spatially scaled streams, and to both single-stream and multi-stream scalability modes.

+

+2. Conventions, Definitions and Acronyms +

+

The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC2119].

+

+2.1. Terminology +

+

A "Layer Refresh Point" is a point in a scalable stream after which a decoder, which previously had been able to decode only some (possibly none) of the available layers of the stream, is able to decode a greater number of the layers.

+

For spatial (or quality) layers, layer refresh typically requires that a spatial layer be encoded in a way that references only lower-layer subpictures of the current picture, not any earlier pictures of that spatial layer. Additionally, the encoder must promise that no earlier pictures of that spatial layer will be used as reference in the future.

+

In a layer refresh, however, other layers than the ones requested for refresh may still maintain dependency on earlier content of the stream. This is the difference between a layer refresh and a Full Intra Request [RFC5104]. This minimizes the coding overhead of refresh to only those parts of the stream that actually need to be refreshed at any given time.

+
+
+

An illustration of spatial layer refresh of an enhancement layer is shown below.

+
+     ... <--  S1  <--  S1       S1  <--  S1  <-- ...
+               |        |        |        |
+              \/       \/       \/       \/
+     ... <--  S0  <--  S0  <--  S0  <--  S0  <-- ...
+
+               1        2        3        4
+          
+

In this illustration, frame 3 is a layer refresh point for spatial layer S1; a decoder which had previously only been decoding spatial layer S0 would be able to decode layer S1 starting at frame 3.

+
+
+

An illustration of spatial layer refresh of a base layer is shown below.

+
+     ... <--  S1  <--  S1  <--  S1  <--  S1  <-- ...
+               |        |        |        |
+              \/       \/       \/       \/
+     ... <--  S0  <--  S0       S0  <--  S0  <-- ...
+
+               1        2        3        4
+          
+

In this illustration, frame 3 is a layer refresh point for spatial layer S0; a decoder which had previously not been decoding the stream at all could decode layer S0 starting at frame 3.

+

For temporal layers, layer refresh requires that the layer be "temporally nested", i.e. use as reference only earlier frames of a lower temporal layer, not any earlier frames of this temporal layer, and also promise that no future frames of this temporal layer will reference frames of this temporal layer before the refresh point. In many cases, the temporal structure of the stream will mean that all frames are temporally nested, in which case decoders will have no need to send LRR messages for the stream.

+
+
+

An illustration of temporal layer refresh is shown below.

+
+        ...  <----- T1  <------ T1          T1  <------ ...
+                   /           /           /
+                 |_          |_          |_
+     ... <--  T0  <------ T0  <------ T0  <------ T0  <--- ...
+
+               1     2     3     4     5     6     7
+          
+

In this illustration, frame 6 is a layer refresh point for temporal layer T1; a decoder which had previously only been decoding temporal layer T0 would be able to decode layer T1 starting at frame 6.

+
+
+

An illustration of an inherently temporally nested stream is shown below.

+
+                    T1          T1          T1
+                   /           /           /
+                 |_          |_          |_
+     ... <--  T0  <------ T0  <------ T0  <------ T0  <--- ...
+
+               1     2     3     4     5     6     7
+          
+

In this illustration, the stream is temporally nested in its ordinary structure; a decoder receiving layer T0 can begin decoding layer T1 at any point.

+

+3. Layer Refresh Request +

+

A layer refresh frame can be requested by sending a Layer Refresh Request (LRR), which is an RTCP payload-specific feedback message [RFC4585] asking the encoder to encode a frame which makes it possible to upgrade to a higher layer. The LRR contains one or two tuples, indicating the layer the decoder wants to upgrade to, and (optionally) the currently highest layer the decoder can decode.

+

The specific format of the tuples, and the mechanism by which a receiver recognizes a refresh frame, is codec-dependent. Usage for several codecs is discussed in Section 4.

+

LRR follows the model of the Full Intra Request (FIR) [RFC5104] (Section 3.5.1) for its retransmission, reliability, and use in multipoint conferences. TODO: expand these here.

+

The LRR message is identified by RTCP packet type value PT=PSFB and FMT=TBD. The FCI field MUST contain one or more LRR entries. Each entry applies to a different media sender, identified by its SSRC.

+

+3.1. Message Format +

+

The Feedback Control Information (FCI) for the Layer Refresh Request consists of one or more FCI entries, the content of which is depicted in Figure 5. The length of the LRR feedback message MUST be set to 2+3*N, where N is the number of FCI entries.

+
+
+
+    0                   1                   2                   3
+    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   |                              SSRC                             |
+   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   | Seq nr.       |C| Payload Type| Reserved                      |
+   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   | Target Layer Index            | Current Layer Index (opt)     |
+   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+          
+

+ +
+
SSRC (32 bits)
+
The SSRC value of the media sender that is requested to send a layer refresh point.
+
Seq nr. (8 bits)
+
Command sequence number. The sequence number space is unique for each pairing of the SSRC of command source and the SSRC of the command target. The sequence number SHALL be increased by 1 modulo 256 for each new command. A repetition SHALL NOT increase the sequence number. The initial value is arbitrary.
+
C (1 bit)
+
A flag bit indicating whether the "Current Layer Index" field is present in the FCI. If this bit is false, the sender of the LRR message is requesting refresh of all layers up to and including the target layer.
+
Payload Type (7 bits)
+
The RTP payload type for which the LRR is being requested. This gives the context in which the target layer index is to be interpreted.
+
Reserved (16 bits)
+
All bits SHALL be set to 0 by the sender and SHALL be ignored on reception.
+
Target Layer Index (16 bits)
+
The target layer for which the receiver wishes a refresh point. Its format is dependent on the payload type field.
+
Current Layer Index (16 bits)
+
If C is 1, the current layer being decoded by the receiver. This message is not requesting refresh of layers at or below this layer. If C is 0, this field SHALL be set to 0 by the sender and SHALL be ignored on reception.
+
+ +

+

+4. Usage with specific codecs +

+

+4.1. H264 SVC

+

H.264 SVC [RFC6190] defines temporal, dependency (spatial), and quality scalability modes.

+
+
+
+            +---------------+---------------+
+            |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7|
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |R| DID |  QID  | TID |RES      |
+            +---------------+---------------+
+          
+

Figure 6 shows the format of the layer index field for H.264 SVC streams. This is designed to follow the same layout as the third and fourth bytes of the H.264 SVC NAL unit extension, which carry the stream's layer information. The "R" and "RES" fields MUST be set to 0 on transmission and ignored on reception. See [RFC6190] Section 1.1.3 for details on the DID, QID, and TID fields.

+

A dependency or quality layer refresh of a given layer in H.264 SVC can be identified by the "I" bit (idr_flag) in the extended NAL unit header, present in NAL unit types 14 (prefix NAL unit) and 20 (coded scalable slice). Layer refresh of the base layer can also be identified by the NAL unit type of its coded slices, which is "5" rather than "1". A dependency or quality layer refresh is complete once this bit has been seen on all the appropriate layers (in decoding order) above the current layer index (if any, or beginning from the base layer if not) through the target layer index.

+

Note that the "I" bit in a PACSI header is set if the corresponding bit is set in any of the aggregated NAL units it describes; thus, it is not sufficient to identify layer refresh when NAL units of multiple dependency or quality layers are aggregated.

+

In H.264 SVC, temporal layer refresh information can be determined from various Supplemental Enhancement Information (SEI) messages in the bitstream.

+

Whether an H.264 SVC stream is scalably nested can be determined from the Scalability Information SEI message's temporal_id_nesting flag. If this flag is set in a stream's currently applicable Scalability Information SEI, receivers SHOULD NOT send temporal LRR messages for that stream, as every frame is implicitly a temporal layer refresh point. (The Scalability Information SEI message may also be available in the signaling negotiation of H.264 SVC, as the sprop-scalability-info parameter.)

+

If a stream's temporal_id_nesting flag is not set, the Temporal Level Switching Point SEI message identifies temporal layer switching points. A temporal layer refresh is satisfied when this SEI message is present in a frame with the target layer index, if the message's delta_frame_num refers to a frame with the requested current layer index. (Alternately, temporal layer refresh can also be satisfied by a complete state refresh, such as an IDR.) Senders which support receiving LRR for non-scalably-nested streams MUST insert Temporal Level Switching Point SEI messages as appropriate.

+

+4.2. VP8

+

The VP8 RTP payload format [I-D.ietf-payload-vp8] defines temporal scalability modes. It does not support spatial scalability.

+
+
+
+            +---------------+---------------+
+            |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7|
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |TID| RES                       |
+            +---------------+---------------+
+          
+

Figure 7 shows the format of the layer index field for VP8 streams. The "RES" field MUST be set to 0 on transmission and ignored on reception. See [I-D.ietf-payload-vp8] Section 4.2 for details on the TID field.

+

TODO: identifying layer refresh frames in a VP8 bitstream. Is the "Y" bit sufficient? Or is VP8 required to always be temporally nested, leaving this unnecessary?

+

+4.3. H265

+

The initial version of the H.265 payload format [I-D.ietf-payload-rtp-h265] defines temporal scalability, with protocol elements reserved for spatial or other scalability modes (which are expected to be defined in a future version of the specification).

+
+
+
+            +---------------+---------------+
+            |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7|
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            | RES         |  LayerId  | TID |
+            +-------------+-----------------+
+          
+

Figure 8 shows the format of the layer index field for H.265 streams. This is designed to follow the same layout as the first and second bytes of the H.265 NAL unit header, which carry the stream's layer information. The "RES" field MUST be set to 0 on transmission and ignored on reception. See [I-D.ietf-payload-rtp-h265] Section 1.1.4 for details on the LayerId and TID fields.

+

H.265 streams signal whether they are temporally nested, using the vps_temporal_id_nesting_flag in the Video Parameter Set (VPS), and the sps_temporal_id_nesting_flag in the Sequence Parameter Set (SPS). If this flag is set in a stream's currently applicable VPS or SPS, receivers SHOULD NOT send temporal LRR messages for that stream, as every frame is implicitly a temporal layer refresh point.

+

If a stream's sps_temporal_id_nesting_flag is not set, the NAL unit types 2 to 5 inclusively identify temporal layer switching points. A layer refresh to any higher target temporal layer is satisfied when a NAL unit type of 4 or 5 with TID equal to 1 more than current TID is seen. Alternatively, layer refresh to a target temporal layer can be incrementally satisfied with NAL unit type of 2 or 3. In this case, given current TID = T0 and target TID = TN, layer refresh to TN is satisfied when NAL unit type of 2 or 3 is seen for TID = T1, then TID = T2, all the way up to TID = TN. During this incremental process, layer refresh to TN can be completely satisfied as soon as a NAL unit type of 2 or 3 is seen.

+

Of course, temporal layer refresh can also be satisfied whenever any Intra Random Access Point (IRAP) NAL unit type (with values 16-23, inclusively) is seen. An IRAP picture is similar to an IDR picture in H.264 (NAL unit type of 5 in H.264) where decoding of the picture can start without any older pictures.

+

In the (future) H.265 payloads that support spatial scalability, a spatial layer refresh of a specific layer can be identified by NAL units with the requested layer ID and NAL unit types between 16 and 21 inclusive. A dependency or quality layer refresh is complete once NAL units of this type have been seen on all the appropriate layers (in decoding order) above the current layer index (if any, or beginning from the base layer if not) through the target layer index.

+

+4.4. VP9

+

The RTP payload format for VP9 [I-D.uberti-payload-vp9] defines how it can be used for spatial and temporal scalability.

+
+
+
+            +---------------+---------------+
+            |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7|
+            +-------------+-----------------+
+            |  T  |R|  S  | RES             |
+            +-------------+-----------------+
+          
+

Figure 9 shows the format of the layer index field for VP9 streams. This is designed to follow the same layout as the "L" byte of the VP9 payload header, which carries the stream's layer information. The "R" and "RES" fields MUST be set to 0 on transmission and ignored on reception. See [I-D.uberti-payload-vp9] for details on the T and S fields.

+

Identification of a layer refresh frame can be derived from the reference IDs of each frame by backtracking the dependency chain until reaching a point where only decodable frames are being referenced. Therefore it's recommended for both the flexible and the non-flexible mode that, when upgrade frames are being encoded in response to an LRR, those packets should contain layer indices and the reference fields so that the decoder or an MCU can make this derivation.

+

Example:

+

LRR {1,0}, {2,1} is sent by an MCU when it is currently relaying {1,0} to a receiver and wants to upgrade to {2,1}. In response the encoder should encode the next frames in layers {1,1} and {2,1} by only referring to frames in {1,0}, or {0,0}.

+

In the non-flexible mode, periodic upgrade frames can be defined by the layer structure of the SS, thus periodic upgrade frames can be automatically identified by the picture ID.

+

+5. Usage with different scalability transmission mechanisms

+

Several different mechanisms are defined for how scalable streams can be transmitted in RTP. The RTP Taxonomy [I-D.ietf-avtext-rtp-grouping-taxonomy] Section 3.7 defines three mechanisms: Single RTP Stream on a Single Media Transport (SRST), Multiple RTP Streams on a Single Media Transport (MRST), and Multiple RTP Streams on Multiple Media Transports (MRMT).

+

The LRR message is applicable to all these mechanisms. For MRST and MRMT mechanisms, the "media source" field of the LRR FCI is set to the SSRC of the RTP stream containing the layer indicated by the Current Layer Index (if "C" is 1), or the stream containing the base encoded stream (if "C" is 0). For MRMT, it is sent on the RTP session on which this stream is sent. On receipt, the sender MUST refresh all the layers requested in the stream, simultaneously in decode order.

+

Note: arguably, for the MRST and MRMT mechanisms, FIR feedback messages could instead be used to refresh specific individual layers. However, the usage of FIR for MRST/MRMT is not explicitly specified anywhere, and if FIR is interpreted as refreshing layers, there is no way to request an actual full, synchronized refresh of all the layers of an MRST/MRMT layered source. Thus, the authors feel that interpreting FIR as refreshing the entire source, and using LRR for the individual layers, would be more useful.

+

+6. Security Considerations +

+

All the security considerations of FIR feedback packets [RFC5104] apply to LRR feedback packets as well. Additionally, media senders receiving LRR feedback packets MUST validate that the payload types and layer indices they are receiving are valid for the stream they are currently sending, and discard the requests if not.

+

+7. IANA Considerations +

+

The IANA is requested to register the following values:
- TODO: PSFB value for LRR

+

+8. References

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
[I-D.ietf-payload-vp8] +Westin, P, Lundin, H, Glover, M, Uberti, J and F Galligan, "RTP Payload Format for VP8 Video", Internet-Draft draft-ietf-payload-vp8-16, June 2015.
[I-D.uberti-payload-vp9] +Uberti, J, Holmer, S, Flodman, M, Lennox, J and D Hong, "RTP Payload Format for VP9 Video", Internet-Draft draft-uberti-payload-vp9-01, March 2015.
[I-D.ietf-payload-rtp-h265] +Wang, Y, Sanchez, Y, Schierl, T, Wenger, S and M Hannuksela, "RTP Payload Format for High Efficiency Video Coding", Internet-Draft draft-ietf-payload-rtp-h265-13, June 2015.
[I-D.ietf-avtext-rtp-grouping-taxonomy] +Lennox, J, Gross, K, Nandakumar, S, Salgueiro, G and B Burman, "A Taxonomy of Semantics and Mechanisms for Real-Time Transport Protocol (RTP) Sources", Internet-Draft draft-ietf-avtext-rtp-grouping-taxonomy-08, July 2015.
[RFC2119] +Bradner, S., "Key words for use in RFCs to Indicate Requirement Levels", BCP 14, RFC 2119, DOI 10.17487/RFC2119, March 1997.
[RFC4585] +Ott, J., Wenger, S., Sato, N., Burmeister, C. and J. Rey, "Extended RTP Profile for Real-time Transport Control Protocol (RTCP)-Based Feedback (RTP/AVPF)", RFC 4585, DOI 10.17487/RFC4585, July 2006.
[RFC5104] +Wenger, S., Chandra, U., Westerlund, M. and B. Burman, "Codec Control Messages in the RTP Audio-Visual Profile with Feedback (AVPF)", RFC 5104, DOI 10.17487/RFC5104, February 2008.
[RFC6190] +Wenger, S., Wang, Y.-K., Schierl, T. and A. Eleftheriadis, "RTP Payload Format for Scalable Video Coding", RFC 6190, DOI 10.17487/RFC6190, May 2011.
+

Authors' Addresses

+
+
+ + Jonathan Lennox + + + Vidyo, Inc. + + 433 Hackensack Avenue +Seventh Floor + + + Hackensack, + NJ + 07601 + + US + + EMail: jonathan@vidyo.com + +
+
+
+ + Danny Hong + + + Vidyo, Inc. + + 433 Hackensack Avenue +Seventh Floor + + + Hackensack, + NJ + 07601 + + US + + EMail: danny@vidyo.com + +
+
+
+ + Justin Uberti + + + Google, Inc. + + 747 6th Street South + + + Kirkland, + WA + 98033 + + USA + + EMail: justin@uberti.name + +
+
+
+ + Stefan Holmer + + + Google, Inc. + + Kungsbron 2 + + + Stockholm, + + 111 22 + + Sweden + + EMail: holmer@google.com + +
+
+
+ + Magnus Flodman + + + Google, Inc. + + Kungsbron 2 + + + Stockholm, + + 111 22 + + Sweden + + EMail: mflodman@google.com + +
+
+ + + \ No newline at end of file diff --git a/lrr/draft-ietf-avtext-lrr-00.txt b/lrr/draft-ietf-avtext-lrr-00.txt new file mode 100644 index 0000000..83c059f --- /dev/null +++ b/lrr/draft-ietf-avtext-lrr-00.txt @@ -0,0 +1,648 @@ + + + +Payload Working Group J. Lennox +Internet-Draft D. Hong +Intended status: Standards Track Vidyo +Expires: February 11, 2016 J. Uberti + S. Holmer + M. Flodman + Google + August 12, 2015 + + The Layer Refresh Request (LRR) RTCP Feedback Message + draft-ietf-avtext-lrr-00 + +Abstract + + This memo describes the RTP Payload-Specific Feedback Message "Layer + Refresh Request" (LRR), which can be used to request a state refresh + of one or more substreams of a layered media stream. It also defines + its use with several scalable media formats. + +Status of this Memo + + This Internet-Draft is submitted in full conformance with the + provisions of BCP 78 and BCP 79. + + Internet-Drafts are working documents of the Internet Engineering + Task Force (IETF). Note that other groups may also distribute + working documents as Internet-Drafts. The list of current Internet- + Drafts is at http://datatracker.ietf.org/drafts/current/. + + Internet-Drafts are draft documents valid for a maximum of six months + and may be updated, replaced, or obsoleted by other documents at any + time. It is inappropriate to use Internet-Drafts as reference + material or to cite them other than as "work in progress." + + This Internet-Draft will expire on February 11, 2016. + +Copyright Notice + + Copyright (c) 2015 IETF Trust and the persons identified as the + document authors. All rights reserved. + + This document is subject to BCP 78 and the IETF Trust's Legal + Provisions Relating to IETF Documents (http://trustee.ietf.org/ + license-info) in effect on the date of publication of this document. + Please review these documents carefully, as they describe your rights + and restrictions with respect to this document. 
Code Components + extracted from this document must include Simplified BSD License text + as described in Section 4.e of the Trust Legal Provisions and are + provided without warranty as described in the Simplified BSD License. + +Table of Contents + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 1] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + 1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . . 2 + 2. Conventions, Definitions and Acronyms . . . . . . . . . . . . 2 + 2.1. Terminology . . . . . . . . . . . . . . . . . . . . . . . 2 + 3. Layer Refresh Request . . . . . . . . . . . . . . . . . . . . 4 + 3.1. Message Format . . . . . . . . . . . . . . . . . . . . . . 4 + 4. Usage with specific codecs . . . . . . . . . . . . . . . . . . 5 + 4.1. H264 SVC . . . . . . . . . . . . . . . . . . . . . . . . . 5 + 4.2. VP8 . . . . . . . . . . . . . . . . . . . . . . . . . . . 6 + 4.3. H265 . . . . . . . . . . . . . . . . . . . . . . . . . . . 7 + 4.4. VP9 . . . . . . . . . . . . . . . . . . . . . . . . . . . 8 + 5. Usage with different scalability transmission mechanisms . . . 9 + 6. Security Considerations . . . . . . . . . . . . . . . . . . . 9 + 7. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 9 + 8. References . . . . . . . . . . . . . . . . . . . . . . . . . . 9 + Authors' Addresses . . . . . . . . . . . . . . . . . . . . . . . . 10 + +1. Introduction + + This memo describes an RTP Payload-Specific Feedback Message + [RFC4585] "Layer Refresh Request" (LRR). It is designed to allow a + receiver of a layered media stream to request that one or more of its + substreams be refreshed, such that it can then be decoded by an + endpoint which previously was not receiving those layers, without + requiring that the entire stream be refreshed (as it would be if the + receiver sent a Full Intra Request (FIR) [RFC5104]. 
+ + The message is designed to be applicable both to temporally and + spatially scaled streams, and to both single-stream and multi-stream + scalability modes. + +2. Conventions, Definitions and Acronyms + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in [RFC2119]. + +2.1. Terminology + + A "Layer Refresh Point" is a point in a scalable stream after which a + decoder, which previously had been able to decode only some (possibly + none) of the available layers of stream, is able to decode a greater + number of the layers. + + For spatial (or quality) layers, layer refresh typically requires + that a spatial layer be encoded in a way that references only lower- + layer subpictures of the current picture, not any earlier pictures of + that spatial layer. Additionally, the encoder must promise that no + earlier pictures of that spatial layer will be used as reference in + the future. + + In a layer refresh, however, other layers than the ones requested for + refresh may still maintain dependency on earlier content of the + stream. This is the difference between a layer refresh and a Full + + + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 2] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + Intra Request [RFC5104]. This minimizes the coding overhead of + refresh to only those parts of the stream that actually need to be + refreshed at any given time. + + An illustration of spatial layer refresh of an enhancement layer is + shown below. + + ... <-- S1 <-- S1 S1 <-- S1 <-- ... + | | | | + \/ \/ \/ \/ + ... <-- S0 <-- S0 <-- S0 <-- S0 <-- ... + + 1 2 3 4 + + In this illustration, frame 3 is a layer refresh point for spatial + layer S1; a decoder which had previously only been decoding spatial + layer S0 would be able to decode layer S1 starting at frame 3. 
+ + An illustration of spatial layer refresh of a base layer is shown + below. + + ... <-- S1 <-- S1 <-- S1 <-- S1 <-- ... + | | | | + \/ \/ \/ \/ + ... <-- S0 <-- S0 S0 <-- S0 <-- ... + + 1 2 3 4 + + In this illustration, frame 3 is a layer refresh point for spatial + layer S0; a decoder which had previously not been decoding the stream + at all could decode layer S0 starting at frame 3. + + For temporal layers, layer refresh requires that the layer be + "temporally nested", i.e. use as reference only earlier frames of a + lower temporal layer, not any earlier frames of this temporal layer, + and also promise that no future frames of this temporal layer will + reference frames of this temporal layer before the refresh point. In + many cases, the temporal structure of the stream will mean that all + frames are temporally nested, in which case decoders will have no + need to send LRR messages for the stream. + + An illustration of temporal layer refresh is shown below. + + ... <----- T1 <------ T1 T1 <------ ... + / / / + |_ |_ |_ + ... <-- T0 <------ T0 <------ T0 <------ T0 <--- ... + + + + + + + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 3] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + + 1 2 3 4 5 6 7 + + In this illustration, frame 6 is a layer refresh point for temporal + layer T1; a decoder which had previously only been decoding temporal + layer T0 would be able to decode layer T1 starting at frame 6. + + An illustration of an inherently temporally nested stream is shown + below. + + T1 T1 T1 + / / / + |_ |_ |_ + ... <-- T0 <------ T0 <------ T0 <------ T0 <--- ... + + 1 2 3 4 5 6 7 + + In this illustration, the stream is temporally nested in its ordinary + structure; a decoder receiving layer T0 can begin decoding layer T1 + at any point. + +3. 
Layer Refresh Request + + A layer refresh frame can be requested by sending a Layer Refresh + Request (LRR), which is an RTCP payload-specific feedback message + [RFC4585] asking the encoder to encode a frame which makes it + possible to upgrade to a higher layer. The LRR contains one or two + tuples, indicating the layer the decoder wants to upgrade to, and + (optionally) the currently highest layer the decoder can decode. + + The specific format of the tuples, and the mechanism by which a + receiver recognizes a refresh frame, is codec-dependent. Usage for + several codecs is discussed in Section 4. + + LRR follows the model of the Full Intra Request (FIR) + [RFC5104](Section 3.5.1) for its retransmission, reliability, and use + in multipoint conferences. TODO: expand these here. + + The LRR message is identified by RTCP packet type value PT=PSFB and + FMT=TBD. The FCI field MUST contain one or more FIR entries. Each + entry applies to a different media sender, identified by its SSRC. + +3.1. Message Format + + The Feedback Control Information (FCI) for the Layer Refresh Request + consists of one or more FCI entries, the content of which is depicted + in Figure 5. The length of the LRR feedback message MUST be set to + 2+3*N, where N is the number of FCI entries. + + + + + + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 4] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | SSRC | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Seq nr. |C| Payload Type| Reserved | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Target Layer Index | Current Layer Index (opt) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + SSRC (32 bits) The SSRC value of the media sender that is requested + to send a layer refresh point. + + Seq nr. 
(8 bits) Command sequence number. The sequence number space + is unique for each pairing of the SSRC of command source and the + SSRC of the command target. The sequence number SHALL be + increased by 1 modulo 256 for each new command. A repetition + SHALL NOT increase the sequence number. The initial value is + arbitrary. + + C (1 bit) A flag bit indicating whether the "Current Layer Index" + field is present in the FCI. If this bit is false, the sender of + the LRR message is requesting refresh of all layers up to and + including the target layer. + + Payload Type (7 bits) The RTP payload type for which the LRR is being + requested. This gives the context in which the target layer index + is to be interpreted. + + Reserved (16 bits) All bits SHALL be set to 0 by the sender and SHALL + be ignored on reception. + + Target Layer Index (16 bits) The target layer for which the receiver + wishes a refresh point. Its format is dependent on the payload + type field. + + Current Layer Index (16 bits) If C is 1, the current layer being + decoded by the receiver. This message is not requesting refresh + of layers at or below this layer. If C is 0, this field SHALL be + set to 0 by the sender and SHALL be ignored on reception. + +4. Usage with specific codecs + +4.1. H264 SVC + + H.264 SVC [RFC6190] defines temporal, dependency (spatial), and + quality scalability modes. + + + + + + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 5] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + +---------------+---------------+ + |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |R| DID | QID | TID |RES | + +---------------+---------------+ + + Figure 6 shows the format of the layer index field for H.264 SVC + streams. This is designed to follow the same layout as the third and + fourth bytes of the H.264 SVC NAL unit extension, which carry the + stream's layer information. 
The "R" and "RES" fields MUST be set to + 0 on transmission and ignored on reception. See [RFC6190] Section + 1.1.3 for details on the DID, QID, and TID fields. + + A dependency or quality layer refresh of a given layer in H.264 SVC + can be identified by the "I" bit (idr_flag) in the extended NAL unit + header, present in NAL unit types 14 (prefix NAL unit) and 20 (coded + scalable slice). Layer refresh of the base layer can also be + identified by its NAL unit type of its coded slices, which is "5" + rather than "1". A dependency or quality layer refresh is complete + once this bit has been seen on all the appropriate layers (in + decoding order) above the current layer index (if any, or beginning + from the base layer if not) through the target layer index. + + Note that as the "I" bit in a PACSI header is set if the + corresponding bit is set in any of the aggregated NAL units it + describes; thus, it is not sufficient to identify layer refresh when + NAL units of multiple dependency or quality layers are aggregated. + + In H.264 SVC, temporal layer refresh information can be determined + from various Supplemental Encoding Information (SEI) messages in the + bitstream. + + Whether an H.264 SVC stream is scalably nested can be determined from + the Scalability Information SEI message's temporal_id_nesting flag. + If this flag is set in a stream's currently applicable Scalability + Information SEI, receivers SHOULD NOT send temporal LRR messages for + that stream, as every frame is implicitly a temporal layer refresh + point. (The Scalability Information SEI message may also be + available in the signaling negotiation of H.264 SVC, as the sprop- + scalability-info parameter.) + + If a stream's temporal_id_nesting flag is not set, the Temporal Level + Switching Point SEI message identifies temporal layer switching + points. 
A temporal layer refresh is satisfied when this SEI message + is present in a frame with the target layer index, if the message's + delta_frame_num refer to a frame with the requested current layer + index. (Alternately, temporal layer refresh can also be satisfied by + a complete state refresh, such as an IDR.) Senders which support + receiving LRR for non-scalably-nested streams MUST insert Temporal + Level Switching Point SEI messages as appropriate. + +4.2. VP8 + + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 6] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + + The VP8 RTP payload format [I-D.ietf-payload-vp8] defines temporal + scalability modes. It does not support spatial scalability. + + +---------------+---------------+ + |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |TID| RES | + +---------------+---------------+ + + Figure 7 shows the format of the layer index field for VP8 streams. + The "RES" fields MUST be set to 0 on transmission and ingnored on + reception. See [I-D.ietf-payload-vp8] Section 4.2 for details on the + TID field. + + TODO: identifying layer refresh frames in an VP8 bitstream. Is the + "Y" bit sufficient? Or is VP8 required to always be temporally + nested, leaving this unnecessary? + +4.3. H265 + + The initial version of the H.265 payload format [I-D.ietf-payload- + rtp-h265] defines temporal scalability, with protocol elements + reserved for spatial or other scalability modes (which are expected + to be defined in a future version of the specification). + + +---------------+---------------+ + |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | RES | LayerId | TID | + +-------------+-----------------+ + + Figure 8 shows the format of the layer index field for H.265 streams. + This is designed to follow the same layout as the first and second + bytes of the H.265 NAL unit header, which carry the stream's layer + information. 
The "RES" field MUST be set to 0 on transmission and + ignored on reception. See [I-D.ietf-payload-rtp-h265] Section 1.1.4 + for details on the LayerId and TID fields. + + H.265 streams signal whether they are temporally nested, using the + vps_temporal_id_nesting_flag in the Video Parameter Set (VPS), and + the sps_temporal_id_nesting_flag in the Sequence Parameter Set (SPS). + If this flag is set in a stream's currently applicable VPS or SPS, + receivers SHOULD NOT send temporal LRR messages for that stream, as + every frame is implicitly a temporal layer refresh point. + + + + + + + + + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 7] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + + If a stream's sps_temporal_id_nesting_flag is not set, the NAL unit + types 2 to 5 inclusively identify temporal layer switching points. A + layer refresh to any higher target temporal layer is satisfied when a + NAL unit type of 4 or 5 with TID equal to 1 more than current TID is + seen. Alternatively, layer refresh to a target temporal layer can be + incrementally satisfied with NAL unit type of 2 or 3. In this case, + given current TID = TO and target TID = TN, layer refresh to TN is + satisfied when NAL unit type of 2 or 3 is seen for TID = T1, then TID + = T2, all the way up to TID = TN. During this incremental process, + layer referesh to TN can be completely satisfied as soon as a NAL + unit type of 2 or 3 is seen. + + Of course, temporal layer refresh can also be satisfied whenever any + Intra Random Access Point (IRAP) NAL unit type (with values 16-23, + inclusively) is seen. An IRAP picture is similar to an IDR picture + in H.264 (NAL unit type of 5 in H.264) where decoding of the picture + can start without any older pictures. 
+ + In the (future) H.265 payloads that support spatial scalability, a + spatial layer refresh of a specific layer can be identified by NAL + units with the requested layer ID and NAL unit types between 16 and + 21 inclusive. A dependency or quality layer refresh is complete once + NAL units of this type have been seen on all the appropriate layers + (in decoding order) above the current layer index (if any, or + beginning from the base layer if not) through the target layer index. + +4.4. VP9 + + The RTP payload format for VP9 [I-D.uberti-payload-vp9] defines how + it can be used for spatial and temporal scalability. + + +---------------+---------------+ + |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7| + +-------------+-----------------+ + | T |R| S | RES | + +-------------+-----------------+ + + Figure 9 shows the format of the layer index field for VP9 streams. + This is designed to follow the same layout as the "L" byte of the VP9 + payload header, which carries the stream's layer information. The + "R" and "RES" fields MUST be set to 0 on transmission and ingnored on + reception. See [I-D.uberti-payload-vp9] for details on the T and S + fields. + + Identification of a layer refresh frame can be derived from the + reference IDs of each frame by backtracking the dependency chain + until reaching a point where only decodable frames are being + referenced. Therefore it's recommended for both the flexible and the + non-flexible mode that, when upgrade frames are being encoded in + response to a LRR, those packets should contain layer indices and the + reference fields so that the decoder or an MCU can make this + derivation. + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 8] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + + Example: + + LRR {1,0}, {2,1} is sent by an MCU when it is currently relaying + {1,0} to a receiver and which wants to upgrade to {2,1}. 
In response + the encoder should encode the next frames in layers {1,1} and {2,1} + by only referring to frames in {1,0}, or {0,0}. + + In the non-flexible mode, periodic upgrade frames can be defined by + the layer structure of the SS, thus periodic upgrade frames can be + automatically identified by the picture ID. + +5. Usage with different scalability transmission mechanisms + + Several different mechanisms are defined for how scalable streams can + be transmitted in RTP. The RTP Taxonomy [I-D.ietf-avtext-rtp- + grouping-taxonomy] Section 3.7 defines three mechanisms: Single RTP + Stream on a Single Media Transport (SRST), Multiple RTP Streams on a + Single Media Transport (MRST), and Multiple RTP Streams on Multiple + Media Transports (MRMT). + + The LRR message is applicable to all these mechanisms. For MRST and + MRMT mechanisms, the "media source" field of the LRR FCI is set to + the SSRC of the RTP stream containing the layer indicated by the + Current Layer Index (if "C" is 1), or the stream containing the base + encoded stream (if "C" is 0). For MRMT, it is sent on the RTP + session on which this stream is sent. On receipt, the sender MUST + refresh all the layers requested in the stream, simultaneously in + decode order. + + Note: arguably, for the MRST and MRMT mechanisms, FIR feedback + messages could instead be used to refresh specific individual layers. + However, the usage of FIR for MRSR/MRMT is not explicitly specified + anywhere, and if FIR is interpreted as refreshing layers, there is no + way to request an actual full, synchronized refresh of all the layers + of an MRST/MRMT layered source. Thus, the authors feel that + interpreting FIR as refreshing the entire source, and using LRR for + the individual layers, would be more useful. + +6. Security Considerations + + All the security considerations of FIR feedback packets [RFC5104] + apply to LRR feedback packets as well. 
Additionally, media senders + receiving LRR feedback packets MUST validate that the payload types + and layer indices they are receiving are valid for the stream they + are currently sending, and discard the requests if not. + +7. IANA Considerations + + The IANA is requested to register the following values: + - TODO: PSFB value for LRR + +8. References + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 9] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + + [I-D.ietf-avtext-rtp-grouping-taxonomy] + Lennox, J., Gross, K., Nandakumar, S., Salgueiro, G. and + B. Burman, "A Taxonomy of Semantics and Mechanisms for + Real-Time Transport Protocol (RTP) Sources", Internet- + Draft draft-ietf-avtext-rtp-grouping-taxonomy-08, July + 2015. + + [I-D.ietf-payload-rtp-h265] + Wang, Y., Sanchez, Y., Schierl, T., Wenger, S. and M. + Hannuksela, "RTP Payload Format for High Efficiency Video + Coding", Internet-Draft draft-ietf-payload-rtp-h265-13, + June 2015. + + [I-D.ietf-payload-vp8] + Westin, P., Lundin, H., Glover, M., Uberti, J. and F. + Galligan, "RTP Payload Format for VP8 Video", Internet- + Draft draft-ietf-payload-vp8-16, June 2015. + + [I-D.uberti-payload-vp9] + Uberti, J., Holmer, S., Flodman, M., Lennox, J. and D. + Hong, "RTP Payload Format for VP9 Video", Internet-Draft + draft-uberti-payload-vp9-01, March 2015. + + [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate + Requirement Levels", BCP 14, RFC 2119, DOI 10.17487/ + RFC2119, March 1997, . + + [RFC4585] Ott, J., Wenger, S., Sato, N., Burmeister, C. and J. Rey, + "Extended RTP Profile for Real-time Transport Control + Protocol (RTCP)-Based Feedback (RTP/AVPF)", RFC 4585, DOI + 10.17487/RFC4585, July 2006, . + + [RFC5104] Wenger, S., Chandra, U., Westerlund, M. and B. Burman, + "Codec Control Messages in the RTP Audio-Visual Profile + with Feedback (AVPF)", RFC 5104, DOI 10.17487/RFC5104, + February 2008, . + + [RFC6190] Wenger, S., Wang, Y.-K., Schierl, T. and A. 
Eleftheriadis, + "RTP Payload Format for Scalable Video Coding", RFC 6190, + DOI 10.17487/RFC6190, May 2011, . + +Authors' Addresses + + + + + + + + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 10] + +Internet-Draft Layer Refresh Request RTCP Feedback August 2015 + + + Jonathan Lennox + Vidyo, Inc. + 433 Hackensack Avenue + Seventh Floor + Hackensack, NJ 07601 + US + + Email: jonathan@vidyo.com + + + Danny Hong + Vidyo, Inc. + 433 Hackensack Avenue + Seventh Floor + Hackensack, NJ 07601 + US + + Email: danny@vidyo.com + + + Justin Uberti + Google, Inc. + 747 6th Street South + Kirkland, WA 98033 + USA + + Email: justin@uberti.name + + + Stefan Holmer + Google, Inc. + Kungsbron 2 + Stockholm, 111 22 + Sweden + + Email: holmer@google.com + + + Magnus Flodman + Google, Inc. + Kungsbron 2 + Stockholm, 111 22 + Sweden + + Email: mflodman@google.com + + + + + + + + +Lennox, Hong, Uberti, HExpires February 11, 2016 [Page 11] diff --git a/lrr/draft-ietf-avtext-lrr-00.xml b/lrr/draft-ietf-avtext-lrr-00.xml new file mode 100644 index 0000000..ae9cf6a --- /dev/null +++ b/lrr/draft-ietf-avtext-lrr-00.xml @@ -0,0 +1,651 @@ + + + + + + + + + + + + + + + + + +]> + + + + + + + + + + + + + + + + + + + + + The Layer Refresh Request (LRR) RTCP Feedback Message + + + Vidyo, Inc. + +
+ + 433 Hackensack Avenue + + Seventh Floor + + Hackensack + + NJ + + 07601 + + US + + + jonathan@vidyo.com +
+
+ + + Vidyo, Inc. + +
+ + 433 Hackensack Avenue + + Seventh Floor + + Hackensack + + NJ + + 07601 + + US + + + danny@vidyo.com +
+
+ + + Google, Inc. + +
+ + 747 6th Street South + + Kirkland + + WA + + 98033 + + USA + + justin@uberti.name +
+
+ + + Google, Inc. +
+ + Kungsbron 2 + + 111 22 + + Stockholm + + Sweden + + holmer@google.com +
+
+ + + Google, Inc. +
+ + Kungsbron 2 + + 111 22 + + Stockholm + + Sweden + + mflodman@google.com +
+
+ + + + RAI + + Payload Working Group + + RFC + + Request for Comments + + RTP + + + This memo describes the RTP Payload-Specific Feedback Message + "Layer Refresh Request" (LRR), which can be used to request a + state refresh of one or more substreams of a layered media + stream. It also defines its use with several scalable media + formats. + +
+ + +
+ This memo describes an RTP Payload-Specific Feedback Message + "Layer Refresh Request" (LRR). It is designed to allow a + receiver of a layered media stream to request that one or more + of its substreams be refreshed, such that it can then be + decoded by an endpoint which previously was not receiving those + layers, without requiring that the + entire stream be refreshed (as it would be if the receiver + sent a Full Intra Request (FIR)). + + The message is designed to be applicable both to temporally + and spatially scaled streams, and to both single-stream and + multi-stream scalability modes. + +
+ +
+ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in . + + +
+ A "Layer Refresh Point" is a point in a scalable stream after + which a decoder, which previously had been able to decode only some + (possibly none) of the available layers of the stream, is able to + decode a greater number of the layers. + + For spatial (or quality) layers, layer refresh typically + requires that a spatial layer be encoded in a way that + references only lower-layer subpictures of the current picture, + not any earlier pictures of that spatial layer. Additionally, + the encoder must promise that no earlier pictures of that + spatial layer will be used as reference in the future. + + In a layer refresh, however, other layers than the ones + requested for refresh may still maintain dependency on earlier + content of the stream. This is the difference between a layer + refresh and a Full Intra + Request. This minimizes the coding overhead of refresh + to only those parts of the stream that actually need to be + refreshed at any given time. + +
+ An illustration of spatial layer refresh of an + enhancement layer is shown below. + + + In this illustration, frame 3 is a layer refresh + point for spatial layer S1; a decoder which had previously + only been decoding spatial layer S0 would be able to + decode layer S1 starting at frame 3. +
+ +
+ An illustration of spatial layer refresh of a base layer is shown + below. + + + In this illustration, frame 3 is a layer refresh + point for spatial layer S0; a decoder which had previously + not been decoding the stream at all could decode layer S0 + starting at frame 3. +
+ + For temporal layers, layer refresh requires that the layer be + "temporally nested", i.e. use as reference only + earlier frames of a lower temporal layer, not any earlier frames of this + temporal layer, and also promise that no future frames + of this temporal layer will reference frames of this temporal + layer before the refresh point. In many cases, the temporal + structure of the stream will mean that all frames are + temporally nested, in which case decoders will have no need to + send LRR messages for the stream. + +
+ An illustration of temporal layer refresh is shown + below. + + + In this illustration, frame 6 is a layer refresh + point for temporal layer T1; a decoder which had previously + only been decoding temporal layer T0 would be able to + decode layer T1 starting at frame 6. +
+ +
+ An illustration of an inherently temporally nested + stream is shown below. + + + In this illustration, the stream is temporally + nested in its ordinary structure; a decoder receiving layer + T0 can begin decoding layer T1 at any point. +
+ +
+ +
+ +
+ A layer refresh frame can be requested by sending a Layer Refresh Request (LRR), + which is an RTCP payload-specific feedback message asking the encoder to encode a frame + which makes it possible to upgrade to a higher layer. The LRR + contains one or two tuples, indicating the layer the decoder + wants to upgrade to, and (optionally) the currently highest + layer the decoder can decode. + + The specific format of the tuples, and the mechanism by which + a receiver recognizes a refresh frame, is + codec-dependent. Usage for several codecs is discussed in + . + + LRR follows the model of the Full + Intra Request (FIR)(Section 3.5.1) for its + retransmission, reliability, and use in multipoint conferences. + TODO: expand these here. + + The LRR message is identified by RTCP packet type value + PT=PSFB and FMT=TBD. The FCI field MUST contain one or more LRR entries. Each entry + applies to a different media sender, identified by its SSRC. + +
+ + The Feedback Control Information (FCI) for the Layer Refresh Request + consists of one or more FCI entries, the content of which is + depicted in . The length of + the LRR feedback message MUST be set to + 2+3*N, where N is the number of FCI entries. + +
+ +
+ + + + The SSRC value of the media sender that is + requested to send a layer refresh point. + + Command sequence number. The sequence number + space is unique for each pairing of the SSRC of command + source and the SSRC of the command target. The sequence + number SHALL be increased by 1 modulo 256 for each new + command. A repetition SHALL NOT increase the sequence + number. The initial value is arbitrary. + + A flag bit indicating whether the + "Current Layer Index" field is present in the FCI. If + this bit is false, the sender of the LRR message is + requesting refresh of all layers up to and including the + target layer. + + The RTP payload type for + which the LRR is being requested. This gives the context in + which the target layer index is to be interpreted. + + All bits SHALL be set to 0 + by the sender and SHALL be ignored on reception. + + The target layer + for which the receiver wishes a refresh point. Its format + is dependent on the payload type field. + + If C is 1, the + current layer being decoded by the receiver. This message + is not requesting refresh of layers at or below this layer. + If C is 0, this field SHALL be set to 0 by the sender and + SHALL be ignored on reception. + + + +
+
+ +
+ +
+ + H.264 SVC defines temporal, + dependency (spatial), and quality scalability modes. + +
+ +
+ + shows the format + of the layer index field for H.264 SVC streams. This is + designed to follow the same layout as the third and fourth + bytes of the H.264 SVC NAL unit extension, which carry the + stream's layer information. The "R" and "RES" + fields MUST be set to 0 on transmission and ignored on + reception. See Section 1.1.3 for + details on the DID, QID, and TID fields. + + A dependency or quality layer refresh of a given layer in + H.264 SVC can be identified by the "I" bit (idr_flag) in the + extended NAL unit header, present in NAL unit types 14 (prefix + NAL unit) and 20 (coded scalable slice). Layer refresh of the + base layer can also be identified by the NAL unit type of + its coded slices, which is "5" rather than "1". A dependency or + quality layer refresh is complete once this bit has been seen + on all the appropriate layers (in decoding order) above the + current layer index (if any, or beginning from the base layer + if not) through the target layer index. + + Note that the "I" bit in a PACSI header is set if the + corresponding bit is set in any of the aggregated NAL units it + describes; thus, it is not sufficient to identify layer + refresh when NAL units of multiple dependency or quality layers + are aggregated. + + In H.264 SVC, temporal layer refresh information can be + determined from various Supplemental Enhancement Information + (SEI) messages in the bitstream. + + Whether an H.264 SVC stream is scalably nested can be determined from + the Scalability Information SEI message's temporal_id_nesting + flag. If this flag is set in a stream's currently applicable + Scalability Information SEI, receivers SHOULD NOT send + temporal LRR messages for that stream, as every frame is + implicitly a temporal layer refresh point. (The Scalability + Information SEI message may also be available in the signaling + negotiation of H.264 SVC, as the sprop-scalability-info + parameter.) 
+ + If a stream's temporal_id_nesting flag is not set, the + Temporal Level Switching Point SEI message identifies temporal + layer switching points. A temporal layer refresh is satisfied + when this SEI message is present in a frame with the target + layer index, if the message's delta_frame_num refers to a frame + with the requested current layer index. (Alternately, + temporal layer refresh can also be satisfied by a complete + state refresh, such as an IDR.) Senders which support + receiving LRR for non-scalably-nested streams MUST insert + Temporal Level Switching Point SEI messages as appropriate. + +
+ +
+ + The VP8 RTP payload + format defines temporal scalability modes. It does not + support spatial scalability. + +
+ +
+ + shows the format + of the layer index field for VP8 streams. The "RES" + fields MUST be set to 0 on transmission and ignored on + reception. See Section 4.2 for + details on the TID field. + + TODO: identifying layer refresh frames in a VP8 + bitstream. Is the "Y" bit sufficient? Or is VP8 required to + always be temporally nested, leaving this unnecessary? +
+ +
+ + The initial version + of the H.265 payload + format defines temporal scalability, with protocol + elements reserved for spatial or other scalability modes + (which are expected to be defined in a future version of the + specification). + +
+ +
+ + shows the format + of the layer index field for H.265 streams. This is + designed to follow the same layout as the first and second + bytes of the H.265 NAL unit header, which carry the + stream's layer information. The "RES" + field MUST be set to 0 on transmission and ignored on + reception. See Section 1.1.4 for + details on the LayerId and TID fields. + + H.265 streams signal whether they are temporally nested, + using the vps_temporal_id_nesting_flag in the Video Parameter + Set (VPS), and the sps_temporal_id_nesting_flag in the Sequence + Parameter Set (SPS). If this flag is set in a stream's currently applicable + VPS or SPS, receivers SHOULD NOT send temporal LRR messages + for that stream, as every frame is implicitly a temporal layer + refresh point. + + If a stream's sps_temporal_id_nesting_flag is not set, the + NAL unit types 2 to 5 inclusively identify temporal + layer switching points. A layer refresh to any higher + target temporal layer is satisfied when a NAL unit type of 4 or 5 + with TID equal to 1 more than current TID is seen. Alternatively, + layer refresh to a target temporal layer can be incrementally + satisfied with NAL unit type of 2 or 3. In this case, given + current TID = TO and target TID = TN, layer refresh to TN is satisfied + when NAL unit type of 2 or 3 is seen for TID = T1, then TID = T2, + all the way up to TID = TN. During this incremental process, layer + refresh to TN can be completely satisfied as soon as a NAL unit type + of 2 or 3 is seen. + + Of course, temporal layer refresh can also be satisfied whenever + any Intra Random Access Point (IRAP) NAL unit type (with values 16-23, + inclusively) is seen. An IRAP picture is similar to an IDR picture in + H.264 (NAL unit type of 5 in H.264) where decoding of the picture can start + without any older pictures. 
+ + In the (future) H.265 payloads that support spatial + scalability, a spatial layer refresh of a specific layer can + be identified by NAL units with the requested layer ID and NAL + unit types between 16 and 21 inclusive. A dependency or + quality layer refresh is complete once NAL units of this type have been seen + on all the appropriate layers (in decoding order) above the + current layer index (if any, or beginning from the base layer + if not) through the target layer index. +
+ +
+ The RTP payload format + for VP9 defines how it can be used for spatial and + temporal scalability. + +
+ +
+ + shows the format + of the layer index field for VP9 streams. This is + designed to follow the same layout as the "L" byte + of the VP9 payload header, which carries the + stream's layer information. The "R" and "RES" + fields MUST be set to 0 on transmission and ignored on + reception. See for + details on the T and S fields. + + Identification of a layer refresh frame can be derived from the + reference IDs of each frame by backtracking the dependency chain + until reaching a point where only decodable frames are being + referenced. Therefore it's recommended for both the + flexible and the non-flexible mode that, when upgrade frames are + being encoded in response to an LRR, those packets should contain + layer indices and the reference fields so that the decoder or an + MCU can make this derivation. + + Example: + LRR {1,0}, {2,1} is sent by an MCU when it is currently + relaying {1,0} to a receiver and which wants to upgrade to + {2,1}. In response the encoder should encode the next frames + in layers {1,1} and {2,1} by only referring to frames in + {1,0}, or {0,0}. + + In the non-flexible mode, periodic upgrade frames can be + defined by the layer structure of the SS, thus periodic upgrade + frames can be automatically identified by the picture ID. +
+ +
+ +
+ + Several different mechanisms are defined for how scalable + streams can be transmitted in RTP. + The RTP + Taxonomy Section 3.7 defines three mechanisms: Single RTP + Stream on a Single Media Transport (SRST), Multiple RTP Streams + on a Single Media Transport (MRST), and Multiple RTP Streams on + Multiple Media Transports (MRMT). + + The LRR message is applicable to all these mechanisms. For + MRST and MRMT mechanisms, the "media source" field of the LRR + FCI is set to the SSRC of the RTP stream containing the layer + indicated by the Current Layer Index (if "C" is 1), or the + stream containing the base encoded stream (if "C" is 0). For + MRMT, it is sent on the RTP session on which this stream is + sent. On receipt, the sender MUST refresh all the layers + requested in the stream, simultaneously in decode order. + + Note: arguably, for the MRST and MRMT mechanisms, FIR + feedback messages could instead be used to refresh specific individual + layers. However, the usage of FIR for MRST/MRMT is not + explicitly specified anywhere, and if FIR is interpreted as refreshing + layers, there is no way to request an actual full, synchronized refresh of + all the layers of an MRST/MRMT layered source. Thus, the authors feel that + interpreting FIR as refreshing the entire source, and using + LRR for the individual layers, would be more useful. + +
+ +
+ All the security considerations of FIR + feedback packets apply to LRR feedback packets as well. + Additionally, media senders receiving LRR feedback packets MUST + validate that the payload types and layer indices they are + receiving are valid for the stream they are currently sending, + and discard the requests if not. +
+ +
+ The IANA is requested to register the following values: - TODO: PSFB value for LRR +
+
+ + + + &vp8rtp; + + &vp9rtp; + + &h265rtp; + + &taxonomy; + + &rfc2119; + + &rfc4585; + + &rfc5104; + + &rfc6190; + + +
+ + diff --git a/nombis/draft-uberti-mmusic-nombis.html b/nombis/draft-uberti-mmusic-nombis.html deleted file mode 100644 index 0d71acd..0000000 --- a/nombis/draft-uberti-mmusic-nombis.html +++ /dev/null @@ -1,655 +0,0 @@ - - - - - - - - - - - draft-uberti-mmusic-nombis-2.txt - Improvements to ICE Candidate Nomination - - - - - - - - -
-
- -
- -
-
-
-
-Network Working Group                                          J. Uberti
-Internet-Draft                                                    Google
-Intended status: Standards Track                               J. Lennox
-Expires: May 10, 2015                                              Vidyo
-                                                       November 06, 2014
-
-
-                Improvements to ICE Candidate Nomination
-                     draft-uberti-mmusic-nombis-00
-
-Abstract
-
-   This document makes recommendations for simplifying and improving the
-   procedures for candidate nomination in Interactive Connectivity
-   Establishment (ICE).
-
-Status of This Memo
-
-   This Internet-Draft is submitted in full conformance with the
-   provisions of BCP 78 and BCP 79.
-
-   Internet-Drafts are working documents of the Internet Engineering
-   Task Force (IETF).  Note that other groups may also distribute
-   working documents as Internet-Drafts.  The list of current Internet-
-   Drafts is at http://datatracker.ietf.org/drafts/current/.
-
-   Internet-Drafts are draft documents valid for a maximum of six months
-   and may be updated, replaced, or obsoleted by other documents at any
-   time.  It is inappropriate to use Internet-Drafts as reference
-   material or to cite them other than as "work in progress."
-
-   This Internet-Draft will expire on May 10, 2015.
-
-Copyright Notice
-
-   Copyright (c) 2014 IETF Trust and the persons identified as the
-   document authors.  All rights reserved.
-
-   This document is subject to BCP 78 and the IETF Trust's Legal
-   Provisions Relating to IETF Documents
-   (http://trustee.ietf.org/license-info) in effect on the date of
-   publication of this document.  Please review these documents
-   carefully, as they describe your rights and restrictions with respect
-   to this document.  Code Components extracted from this document must
-   include Simplified BSD License text as described in Section 4.e of
-   the Trust Legal Provisions and are provided without warranty as
-   described in the Simplified BSD License.
-
-
-
-
-Uberti & Lennox           Expires May 10, 2015                  [Page 1]
-

-Internet-Draft                   IceNom                    November 2014
-
-
-Table of Contents
-
-   1.  Introduction  . . . . . . . . . . . . . . . . . . . . . . . .   2
-   2.  Terminology . . . . . . . . . . . . . . . . . . . . . . . . .   3
-   3.  Goals and Requirements  . . . . . . . . . . . . . . . . . . .   3
-     3.1.  Minimize Call Setup Latency . . . . . . . . . . . . . . .   3
-     3.2.  Allow Controlling Endpoint to Make Dynamic Decisions  . .   3
-     3.3.  Allow Selected Pair Change At Any Time Without Signaling    4
-     3.4.  Allow Continuous Addition of Candidates . . . . . . . . .   4
-     3.5.  Maintain Backwards Compatibility  . . . . . . . . . . . .   4
-     3.6.  Minimize Complexity Increase  . . . . . . . . . . . . . .   5
-   4.  Deprecating Aggressive Nomination . . . . . . . . . . . . . .   5
-     4.1.  Overview  . . . . . . . . . . . . . . . . . . . . . . . .   5
-     4.2.  Operation . . . . . . . . . . . . . . . . . . . . . . . .   5
-     4.3.  Backwards Compatibility . . . . . . . . . . . . . . . . .   6
-   5.  Introducing Continuous Nomination . . . . . . . . . . . . . .   6
-     5.1.  Overview  . . . . . . . . . . . . . . . . . . . . . . . .   6
-     5.2.  Operation . . . . . . . . . . . . . . . . . . . . . . . .   7
-     5.3.  Backwards Compatibility . . . . . . . . . . . . . . . . .   8
-   6.  Examples  . . . . . . . . . . . . . . . . . . . . . . . . . .   8
-   7.  Security Considerations . . . . . . . . . . . . . . . . . . .   8
-   8.  IANA Considerations . . . . . . . . . . . . . . . . . . . . .   8
-   9.  Acknowledgements  . . . . . . . . . . . . . . . . . . . . . .   8
-   10. References  . . . . . . . . . . . . . . . . . . . . . . . . .   8
-     10.1.  Normative References . . . . . . . . . . . . . . . . . .   8
-     10.2.  Informative References . . . . . . . . . . . . . . . . .   9
-   Appendix A.  Change log . . . . . . . . . . . . . . . . . . . . .   9
-   Authors' Addresses  . . . . . . . . . . . . . . . . . . . . . . .   9
-
-1.  Introduction
-
-   Interactive Connectivity Establishment (ICE) attempts to find the
-   'best' path for connectivity between two peers; in ICE parlance,
-   these paths are known as 'candidate pairs'.  During the ICE process,
-   one endpoint, known as the 'controlling' endpoint, selects a
-   candidate pair as the best pair; this action is known as nomination.
-   ICE supports two different mechanisms for performing nomination,
-   known as Regular Nomination, and Aggressive Nomination.
-
-   However, each of these modes have flaws that restrict their
-   usefulness.  Regular Nomination, as currently speced, requires a best
-   pair to be chosen before media transmission can start, causing
-   unnecessary call setup delay.  Aggressive Nomination, while avoiding
-   this delay, gives the controlling endpoint much less discretion into
-   which candidate pair is chosen, preventing it from making decisions
-   based on dynamic factors such as RTT or loss rate.  Needless to say,
-   the presence of both modes also adds nontrivial complexity.
-
-
-
-
-Uberti & Lennox           Expires May 10, 2015                  [Page 2]
-

-Internet-Draft                   IceNom                    November 2014
-
-
-   Lastly, ICE is currently defined as a finite process, where the
-   decision on the optimal candidate pair is made during call setup and
-   infrequently (if ever) changed.  While this may be acceptable for
-   endpoints with static network configurations, it fails to meet the
-   needs of mobile endpoints, who may need to seamlessly move between
-   networks, or be connected to multiple networks simultaneously.  In
-   these cases, the controlling endpoint may want to maintain multiple
-   potential candidate pairs, and make dynamic decisions to switch
-   between them as conditions change.
-
-   To address these challenges, this document makes two proposals for
-   refactoring ICE nomination - merging Regular and Aggressive
-   Nomination, and introducing a new mode, known as Continuous
-   Nomination.  This makes ICE substantially more flexible without
-   increasing complexity.
-
-2.  Terminology
-
-   The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
-   "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
-   document are to be interpreted as described in [RFC2119].
-
-3.  Goals and Requirements
-
-   The goals for improved ICE nomination are enumerated below.
-
-3.1.  Minimize Call Setup Latency
-
-   Modern ICE agents will often have multiple network interfaces and
-   multiple servers from which to obtain ICE candidates.  While some ICE
-   checks may succeed quickly, finishing the entire set of checks can
-   easily take multiple seconds; this concern is discussed in [RFC5245],
-   Section 8.1.1.1.  As a result, ICE endpoints MUST be able to start
-   transmitting media immediately upon a successful ICE check, and MUST
-   retain the ability to switch if a better candidate pair becomes
-   available later.
-
-3.2.  Allow Controlling Endpoint to Make Dynamic Decisions
-
-   While an ICE endpoint will assign various priority values to its ICE
-   candidates, these priorities are static and can only be based on a
-   priori knowledge; the shortcomings of this approach are discussed in
-   the first paragraph of Section 2.6 in [RFC5245].  To properly make
-   choices in multi-network and multi-server scenarios, the controlling
-   endpoint MUST be able to make dynamic decisions about the selected
-   candidate pair based on observed network performance.  For example,
-   RTT could be used to evaluate which TURN servers to use, as described
-   in [I-D.williams-peer-redirect] To ensure symmetric flows, this
-
-
-
-Uberti & Lennox           Expires May 10, 2015                  [Page 3]
-

-Internet-Draft                   IceNom                    November 2014
-
-
-   implies that the controlling endpoint MUST be able to communicate its
-   choice to the controlled side.
-
-3.3.  Allow Selected Pair Change At Any Time Without Signaling
-
-   Expanding on the requirement above, the need to make dynamic
-   decisions is not limited to call setup.  A multihomed endpoint may
-   need to switch interfaces based on mobility considerations, or a
-   robust endpoint may want to keep multiple network paths warm and
-   switch immediately if connectivity is interrupted on one of them.  As
-   the signaling channel may be affected by the event necessitating the
-   switch, this implies that the controlling endpoint MUST be able to
-   change the selected pair and indicate this to the remote side without
-   signaling.  The need for this functionality has been stated in
-   [I-D.wing-mmusic-ice-mobility] and [I-D.singh-avtcore-mprtp].
-
-   The rules in [RFC5245] ensure that the controlled endpoint keeps its
-   candidate needed for the selected pair alive.  However, in order for
-   alternate pairs to remain available, the controlled endpoint must
-   keep the associated candidates alive as well, following the
-   procedures outlined in [RFC5245], Section 4.1.1.4.  This implies that
-   the controlling endpoint MUST have some way to indicate to the
-   controlled side that specific candidates are to be kept alive.
-
-3.4.  Allow Continuous Addition of Candidates
-
-   In certain network mobility scenarios, networks may come up and down
-   while the call is active.  In order to allow candidates gathered on
-   newly available networks to be used for the selected pair or backup
-   pairs, the endpoint MUST be able to gather candidates on these
-   networks and communicate them to the remote side.  While this could
-   be done using an ICE restart, as described in [RFC5245], Section 9.1,
-   the ICE restart may have unintended consequences, such as causing the
-   remote side to regather all candidates.  Instead, it would be best if
-   the new candidates could be trickled, as discussed in
-   [I-D.ietf-mmusic-trickle-ice], but even after ICE processing has
-   completed.
-
-3.5.  Maintain Backwards Compatibility
-
-   To prevent interoperability problems, ICE endpoints that support the
-   functionality listed above MUST still maintain [RFC5245] compliance
-   when interacting with existing endpoints.  However, the ideal
-   solution SHOULD allow some improvements to occur when only the
-   controlling side supports the new functionality.
-
-
-
-
-
-
-Uberti & Lennox           Expires May 10, 2015                  [Page 4]
-

-Internet-Draft                   IceNom                    November 2014
-
-
-3.6.  Minimize Complexity Increase
-
-   Increased functionality typically leads to increased complexity,
-   which leads to more edge cases, and more implementation bugs.  This
-   suggests that in addition to proposing new ICE functionality, the
-   ideal solution SHOULD deprecate superfluous functionality.
-
-4.  Deprecating Aggressive Nomination
-
-4.1.  Overview
-
-   The main benefits of Regular Nomination are that the controlling side
-   can dynamically choose which candidate pair to use, and a clear
-   signal when the nomination process has completed, via the presence of
-   the USE-CANDIDATE flag in a Binding Request.  The main benefit of
-   Aggressive Nomination is that it is only necessary to send a single
-   Binding Request before starting the transmission of media, reducing
-   setup latency.  Why don't we have both?
-
-   By preserving the dynamic behavior of Regular Nomination, but
-   allowing media transmission to start upon a single successful
-   connectivity check, as in Aggressive Nomination, the requirements of
-   Section 3.1 and Section 3.2 can be met, while meeting the
-   compatibility requirement from Section 3.5 and, since Aggressive
-   Nomination is no longer needed, the complexity requirement from
-   Section 3.6.
-
-4.2.  Operation
-
-   Since media may be transmitted as soon as all components have a valid
-   pair, as indicated in [RFC5245], Page 69, an ICE Agent can begin
-   transmitting media as soon as this occurs, even if it has not sent a
-   Binding Request with USE-CANDIDATE.
-
-   This pair can change as more pairs are added to the Valid list on the
-   controlling side.  When nomination completes, and a final pair is
-   selected, this is communicated to the controlled side via the typical
-   Binding Request with USE-CANDIDATE.
-
-   On the controlled side, the same process can occur, with the ICE
-   Agent transmitting media as soon as a valid pair exists.  To
-   encourage use of symmetric RTP, the controlled ICE Agent SHOULD use
-   the same candidate pair on which it received media from the
-   controlling side.  [Doesn't need to be secure media, since the
-   controlling side will finalize this preference through USE-CANDIDATE
-   shortly.]
-
-
-
-
-
-Uberti & Lennox           Expires May 10, 2015                  [Page 5]
-

-Internet-Draft                   IceNom                    November 2014
-
-
-   As this is legal ICE behavior, no negotiation of this mechanism
-   should be needed.  In the event the receiver drops any packets that
-   arrive before a Binding Request with USE-CANDIDATE set, this will
-   simply lead to brief media clipping and will resolve itself once
-   nomination completes.
-
-4.3.  Backwards Compatibility
-
-   When acting in the controlling role, new implementations MUST NOT use
-   Aggressive Nomination.
-
-   When acting in the controlled role, and the controlling side is using
-   Aggressive Nomination (e.g. sending USE-CANDIDATE in its initial
-   Binding Requests), the standard PRIORITY-based mechanism outlined in
-   [RFC5245], Section 8.1.1.2 should be used to determine the reverse
-   media path.
-
-   Note that if implementations would prefer to just avoid Aggressive
-   Nomination altogether, they MAY indicate some TBD pseudo-option in
-   the ice-options attribute.  Because compliant implementations MUST
-   NOT use Aggressive Nomination if an unknown ICE option is
-   encountered, this effectively prohibits the use of Aggressive
-   Nomination.  [N.B. this could be the ice-options:continuous option
-   described below]
-
-5.  Introducing Continuous Nomination
-
-5.1.  Overview
-
-   As discussed above, in mobile environments there can be multiple
-   possible valid candidate pairs, and these can change at various
-   points in the call, as new interfaces go up and down, signal strength
-   for wireless interfaces changes, and new relay servers are
-   discovered.
-
-   However, under [RFC5245] rules, once a candidate pair is selected and
-   confirmed, via USE-CANDIDATE, nomination has completed and cannot be
-   restarted without performing an ICE restart.  This is overly complex
-   in many cases, and especially problematic in some specific ones,
-   namely a wifi-cellular handover, where the signaling path for
-   communicating an ICE restart may be impacted by the handover.
-
-   To address this situation, this section introduces the concept of
-   "continuous nomination", where the controlling ICE endpoint can
-   adjust the selected candidate pair at any time.  By allowing ICE
-   processing to occur continuously during a call, rather than just at
-   call setup, the requirements expressed in Section 3.3 and Section 3.4
-   can be met.
-
-
-
-Uberti & Lennox           Expires May 10, 2015                  [Page 6]
-

-Internet-Draft                   IceNom                    November 2014
-
-
-5.2.  Operation
-
-   Under continuous nomination, ICE never concludes; new candidates can
-   always be trickled, and a new candidate pair can be selected by the
-   controlling side at any time.
-
-   When selecting a new candidate pair, the controlling side informs the
-   controlled side of the chosen pair by sending a new Binding Request
-   with a USE-CANDIDATE attribute.  The decision about which candidate
-   pair to use is fully dynamic; the controlling side can use metrics
-   such as RTT or loss rate to change the selected pair at any time.  If
-   Binding Requests need to be sent for any other reason, such as
-   consent checks [TODO: reference], any checks sent on the selected
-   pair MUST include a USE-CANDIDATE attribute.
-
-   Upon receipt of a Binding Request with USE-CANDIDATE, the controlled
-   side MUST switch its media path to the candidate pair on which the
-   Binding Request was received.
-
-   During continuous nomination, the controlling side may still elect to
-   prune certain candidate pairs; for example, an implementation may
-   choose to drop relay candidates once a successful connection has been
-   established.  The controlled side, however, should follow the
-   controlling side's lead in terms of deciding whether any pairs should
-   be pruned.  The controlling ICE Agent informs the remote side of its
-   preferences by continuing to send Binding Requests to the remote side
-   on each candidate pair that it wants to retain.  The controlled ICE
-   Agent SHOULD prune any candidate pairs that have not received a
-   Binding Request in N seconds (30?), and SHOULD NOT keep alive any
-   candidates that are not associated with a live candidate pair.
-   [TODO: decide if this implicit timeout approach is correct, or if we
-   should have some sort of approach similar to TURN LIFETIME indicating
-   when a pair should be GCed, with LIFETIME==0 indicating immediate
-   GC.]  One side benefit of doing this is that the continuous exchange
-   of Binding Requests across all candidate pairs allows the RTT and
-   loss rate for each to be reliably determined and kept up to date.
-
-   If the endpoints have negotiated Trickle ICE support [TODO:
-   reference], and new candidates become available on either side, the
-   endpoint may send these candidates to the remote side using the
-   existing Trickle ICE mechanisms.  Once all of the new candidates have
-   been transmitted, the endpoint MUST send an end-of-candidates
-   message, which indicates that no more candidates will be sent in the
-   near future.
-
-   At any point, either side may perform an ICE restart, which will
-   result in both sides gathering new ICE candidates, starting a new
-
-
-
-
-Uberti & Lennox           Expires May 10, 2015                  [Page 7]
-

-Internet-Draft                   IceNom                    November 2014
-
-
-   continuous nomination sequence, and upon successful completion,
-   discarding all candidates from the previous nomination sequence.
-
-5.3.  Backwards Compatibility
-
-   Since standard ICE implementations may not expect the selected pair
-   to change after a USE-CANDIDATE attribute is received, support for
-   continuous nomination is explicitly indicated via a new "continuous"
-   value for ice-options.  If the remote side does not support the
-   "continuous" option, the controlling side MUST fall back to Regular
-   Nomination, as specified in [RFC5245], Section 8.1.1.
-
-6.  Examples
-
-   TODO
-
-7.  Security Considerations
-
-   TODO
-
-8.  IANA Considerations
-
-   A new ICE option "continuous" has been [will be] registered in the
-   "ICE Options" registry created by [RFC6336].
-
-9.  Acknowledgements
-
-   Several people provided significant input into this document,
-   including Martin Thomson, Brandon Williams, and Dan Wing.
-
-10.  References
-
-10.1.  Normative References
-
-   [RFC2119]  Bradner, S., "Key words for use in RFCs to Indicate
-              Requirement Levels", BCP 14, RFC 2119, March 1997.
-
-   [RFC5245]  Rosenberg, J., "Interactive Connectivity Establishment
-              (ICE): A Protocol for Network Address Translator (NAT)
-              Traversal for Offer/Answer Protocols", RFC 5245, April
-              2010.
-
-   [RFC6336]  Westerlund, M. and C. Perkins, "IANA Registry for
-              Interactive Connectivity Establishment (ICE) Options", RFC
-              6336, July 2011.
-
-
-
-
-
-
-Uberti & Lennox           Expires May 10, 2015                  [Page 8]
-

-Internet-Draft                   IceNom                    November 2014
-
-
-10.2.  Informative References
-
-   [I-D.ietf-mmusic-trickle-ice]
-              Ivov, E., Rescorla, E., and J. Uberti, "Trickle ICE:
-              Incremental Provisioning of Candidates for the Interactive
-              Connectivity Establishment (ICE) Protocol", draft-ietf-
-              mmusic-trickle-ice-01 (work in progress), February 2014.
-
-   [I-D.singh-avtcore-mprtp]
-              Singh, V., Karkkainen, T., Ott, J., Ahsan, S., and L.
-              Eggert, "Multipath RTP (MPRTP)", draft-singh-avtcore-
-              mprtp-09 (work in progress), June 2014.
-
-   [I-D.williams-peer-redirect]
-              Williams, B. and T. Reddy, "Peer-specific Redirection for
-              Traversal Using Relays around NAT (TURN)", draft-williams-
-              peer-redirect-01 (work in progress), June 2014.
-
-   [I-D.wing-mmusic-ice-mobility]
-              Wing, D., Reddy, T., Patil, P., and P. Martinsen,
-              "Mobility with ICE (MICE)", draft-wing-mmusic-ice-
-              mobility-07 (work in progress), June 2014.
-
-Appendix A.  Change log
-
-   Changes in draft -00:
-
-   o  Initial version, from mailing list discussion post-IETF 90.
-
-Authors' Addresses
-
-   Justin Uberti
-   Google
-   747 6th Ave S
-   Kirkland, WA  98033
-   USA
-
-   Email: justin@uberti.name
-
-
-   Jonathan Lennox
-   Vidyo
-   433 Hackensack Avenue
-   Hackensack, NJ  07601
-   USA
-
-   Email: jonathan@vidyo.com
-
-
-
-
-Uberti & Lennox           Expires May 10, 2015                  [Page 9]
-
-

-Html markup produced by rfcmarkup 1.109, available from -https://tools.ietf.org/tools/rfcmarkup/ - - diff --git a/nombis/draft-uberti-mmusic-nombis.txt b/nombis/draft-uberti-mmusic-nombis.txt deleted file mode 100644 index e995189..0000000 --- a/nombis/draft-uberti-mmusic-nombis.txt +++ /dev/null @@ -1,504 +0,0 @@ - - - - -Network Working Group J. Uberti -Internet-Draft Google -Intended status: Standards Track J. Lennox -Expires: May 10, 2015 Vidyo - November 06, 2014 - - - Improvements to ICE Candidate Nomination - draft-uberti-mmusic-nombis-00 - -Abstract - - This document makes recommendations for simplifying and improving the - procedures for candidate nomination in Interactive Connectivity - Establishment (ICE). - -Status of This Memo - - This Internet-Draft is submitted in full conformance with the - provisions of BCP 78 and BCP 79. - - Internet-Drafts are working documents of the Internet Engineering - Task Force (IETF). Note that other groups may also distribute - working documents as Internet-Drafts. The list of current Internet- - Drafts is at http://datatracker.ietf.org/drafts/current/. - - Internet-Drafts are draft documents valid for a maximum of six months - and may be updated, replaced, or obsoleted by other documents at any - time. It is inappropriate to use Internet-Drafts as reference - material or to cite them other than as "work in progress." - - This Internet-Draft will expire on May 10, 2015. - -Copyright Notice - - Copyright (c) 2014 IETF Trust and the persons identified as the - document authors. All rights reserved. - - This document is subject to BCP 78 and the IETF Trust's Legal - Provisions Relating to IETF Documents - (http://trustee.ietf.org/license-info) in effect on the date of - publication of this document. Please review these documents - carefully, as they describe your rights and restrictions with respect - to this document. 
Code Components extracted from this document must - include Simplified BSD License text as described in Section 4.e of - the Trust Legal Provisions and are provided without warranty as - described in the Simplified BSD License. - - - - -Uberti & Lennox Expires May 10, 2015 [Page 1] - -Internet-Draft IceNom November 2014 - - -Table of Contents - - 1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . 2 - 2. Terminology . . . . . . . . . . . . . . . . . . . . . . . . . 3 - 3. Goals and Requirements . . . . . . . . . . . . . . . . . . . 3 - 3.1. Minimize Call Setup Latency . . . . . . . . . . . . . . . 3 - 3.2. Allow Controlling Endpoint to Make Dynamic Decisions . . 3 - 3.3. Allow Selected Pair Change At Any Time Without Signaling 4 - 3.4. Allow Continuous Addition of Candidates . . . . . . . . . 4 - 3.5. Maintain Backwards Compatibility . . . . . . . . . . . . 4 - 3.6. Minimize Complexity Increase . . . . . . . . . . . . . . 5 - 4. Deprecating Aggressive Nomination . . . . . . . . . . . . . . 5 - 4.1. Overview . . . . . . . . . . . . . . . . . . . . . . . . 5 - 4.2. Operation . . . . . . . . . . . . . . . . . . . . . . . . 5 - 4.3. Backwards Compatibility . . . . . . . . . . . . . . . . . 6 - 5. Introducing Continuous Nomination . . . . . . . . . . . . . . 6 - 5.1. Overview . . . . . . . . . . . . . . . . . . . . . . . . 6 - 5.2. Operation . . . . . . . . . . . . . . . . . . . . . . . . 7 - 5.3. Backwards Compatibility . . . . . . . . . . . . . . . . . 8 - 6. Examples . . . . . . . . . . . . . . . . . . . . . . . . . . 8 - 7. Security Considerations . . . . . . . . . . . . . . . . . . . 8 - 8. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 8 - 9. Acknowledgements . . . . . . . . . . . . . . . . . . . . . . 8 - 10. References . . . . . . . . . . . . . . . . . . . . . . . . . 8 - 10.1. Normative References . . . . . . . . . . . . . . . . . . 8 - 10.2. Informative References . . . . . . . . . . . . . . . . . 9 - Appendix A. Change log . . . . . 
. . . . . . . . . . . . . . . . 9 - Authors' Addresses . . . . . . . . . . . . . . . . . . . . . . . 9 - -1. Introduction - - Interactive Connectivity Establishment (ICE) attempts to find the - 'best' path for connectivity between two peers; in ICE parlance, - these paths are known as 'candidate pairs'. During the ICE process, - one endpoint, known as the 'controlling' endpoint, selects a - candidate pair as the best pair; this action is known as nomination. - ICE supports two different mechanisms for performing nomination, - known as Regular Nomination, and Aggressive Nomination. - - However, each of these modes have flaws that restrict their - usefulness. Regular Nomination, as currently speced, requires a best - pair to be chosen before media transmission can start, causing - unnecessary call setup delay. Aggressive Nomination, while avoiding - this delay, gives the controlling endpoint much less discretion into - which candidate pair is chosen, preventing it from making decisions - based on dynamic factors such as RTT or loss rate. Needless to say, - the presence of both modes also adds nontrivial complexity. - - - - -Uberti & Lennox Expires May 10, 2015 [Page 2] - -Internet-Draft IceNom November 2014 - - - Lastly, ICE is currently defined as a finite process, where the - decision on the optimal candidate pair is made during call setup and - infrequently (if ever) changed. While this may be acceptable for - endpoints with static network configurations, it fails to meet the - needs of mobile endpoints, who may need to seamlessly move between - networks, or be connected to multiple networks simultaneously. In - these cases, the controlling endpoint may want to maintain multiple - potential candidate pairs, and make dynamic decisions to switch - between them as conditions change. 
- - To address these challenges, this document makes two proposals for - refactoring ICE nomination - merging Regular and Aggressive - Nomination, and introducing a new mode, known as Continuous - Nomination. This makes ICE substantially more flexible without - increasing complexity. - -2. Terminology - - The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", - "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this - document are to be interpreted as described in [RFC2119]. - -3. Goals and Requirements - - The goals for improved ICE nomination are enumerated below. - -3.1. Minimize Call Setup Latency - - Modern ICE agents will often have multiple network interfaces and - multiple servers from which to obtain ICE candidates. While some ICE - checks may succeed quickly, finishing the entire set of checks can - easily take multiple seconds; this concern is discussed in [RFC5245], - Section 8.1.1.1. As a result, ICE endpoints MUST be able to start - transmitting media immediately upon a successful ICE check, and MUST - retain the ability to switch if a better candidate pair becomes - available later. - -3.2. Allow Controlling Endpoint to Make Dynamic Decisions - - While an ICE endpoint will assign various priority values to its ICE - candidates, these priorities are static and can only be based on a - priori knowledge; the shortcomings of this approach are discussed in - the first paragraph of Section 2.6 in [RFC5245]. To properly make - choices in multi-network and multi-server scenarios, the controlling - endpoint MUST be able to make dynamic decisions about the selected - candidate pair based on observed network performance. 
For example, - RTT could be used to evaluate which TURN servers to use, as described - in [I-D.williams-peer-redirect] To ensure symmetric flows, this - - - -Uberti & Lennox Expires May 10, 2015 [Page 3] - -Internet-Draft IceNom November 2014 - - - implies that the controlling endpoint MUST be able to communicate its - choice to the controlled side. - -3.3. Allow Selected Pair Change At Any Time Without Signaling - - Expanding on the requirement above, the need to make dynamic - decisions is not limited to call setup. A multihomed endpoint may - need to switch interfaces based on mobility considerations, or a - robust endpoint may want to keep multiple network paths warm and - switch immediately if connectivity is interrupted on one of them. As - the signaling channel may be affected by the event necessitating the - switch, this implies that the controlling endpoint MUST be able to - change the selected pair and indicate this to the remote side without - signaling. The need for this functionality has been stated in - [I-D.wing-mmusic-ice-mobility] and [I-D.singh-avtcore-mprtp]. - - The rules in [RFC5245] ensure that the controlled endpoint keeps its - candidate needed for the selected pair alive. However, in order for - alternate pairs to remain available, the controlled endpoint must - keep the associated candidates alive as well, following the - procedures outlined in [RFC5245], Section 4.1.1.4. This implies that - the controlling endpoint MUST have some way to indicate to the - controlled side that specific candidates are to be kept alive. - -3.4. Allow Continuous Addition of Candidates - - In certain network mobility scenarios, networks may come up and down - while the call is active. In order to allow candidates gathered on - newly available networks to be used for the selected pair or backup - pairs, the endpoint MUST be able to gather candidates on these - networks and communicate them to the remote side. 
While this could - be done using an ICE restart, as described in [RFC5245], Section 9.1, - the ICE restart may have unintended consequences, such as causing the - remote side to regather all candidates. Instead, it would be best if - the new candidates could be trickled, as discussed in - [I-D.ietf-mmusic-trickle-ice], but even after ICE processing has - completed. - -3.5. Maintain Backwards Compatibility - - To prevent interoperability problems, ICE endpoints that support the - functionality listed above MUST still maintain [RFC5245] compliance - when interacting with existing endpoints. However, the ideal - solution SHOULD allow some improvements to occur when only the - controlling side supports the new functionality. - - - - - - -Uberti & Lennox Expires May 10, 2015 [Page 4] - -Internet-Draft IceNom November 2014 - - -3.6. Minimize Complexity Increase - - Increased functionality typically leads to increased complexity, - which leads to more edge cases, and more implementation bugs. This - suggests that in addition to proposing new ICE functionality, the - ideal solution SHOULD deprecate superfluous functionality. - -4. Deprecating Aggressive Nomination - -4.1. Overview - - The main benefits of Regular Nomination are that the controlling side - can dynamically choose which candidate pair to use, and a clear - signal when the nomination process has completed, via the presence of - the USE-CANDIDATE flag in a Binding Request. The main benefit of - Aggressive Nomination is that it is only necessary to send a single - Binding Request before starting the transmission of media, reducing - setup latency. Why don't we have both? 
- - By preserving the dynamic behavior of Regular Nomination, but - allowing media transmission to start upon a single successful - connectivity check, as in Aggressive Nomination, the requirements of - Section 3.1 and Section 3.2 can be met, while meeting the - compatibility requirement from Section 3.5 and, since Aggressive - Nomination is no longer needed, the complexity requirement from - Section 3.6. - -4.2. Operation - - Since media may be transmitted as soon as all components have a valid - pair, as indicated in [RFC5245], Page 69, an ICE Agent can begin - transmitting media as soon as this occurs, even if it has not sent a - Binding Request with USE-CANDIDATE. - - This pair can change as more pairs are added to the Valid list on the - controlling side. When nomination completes, and a final pair is - selected, this is communicated to the controlled side via the typical - Binding Request with USE-CANDIDATE. - - On the controlled side, the same process can occur, with the ICE - Agent transmitting media as soon as a valid pair exists. To - encourage use of symmetric RTP, the controlled ICE Agent SHOULD use - the same candidate pair on which it received media from the - controlling side. [Doesn't need to be secure media, since the - controlling side will finalize this preference through USE-CANDIDATE - shortly.] - - - - - -Uberti & Lennox Expires May 10, 2015 [Page 5] - -Internet-Draft IceNom November 2014 - - - As this is legal ICE behavior, no negotiation of this mechanism - should be needed. In the event the receiver drops any packets that - arrive before a Binding Request with USE-CANDIDATE set, this will - simply lead to brief media clipping and will resolve itself once - nomination completes. - -4.3. Backwards Compatibility - - When acting in the controlled role, new implementations MUST NOT use - Aggressive Nomination. - - When acting in the controlled role, and the controlling side is using - Aggressive Nomination (e.g. 
sending USE-CANDIDATE in its initial - Binding Requests), the standard PRIORITY-based mechanism outlined in - [RFC5245], Section 8.1.1.2 should be used to determine the reverse - media path. - - Note that if implementations would prefer to just avoid Aggressive - Nomination altogether, they MAY indicate some TBD pseudo-option in - the ice-options attribute. Because compliant implementations MUST - NOT use Aggressive Nomination if an unknown ICE option is - encountered, this effectively prohibits the use of Aggressive - Nomination. [N.B. this could be the ice-options:continuous option - described below] - -5. Introducing Continuous Nomination - -5.1. Overview - - As discussed above, in mobile environments there can be multiple - possible valid candidate pairs, and these can change at various - points in the call, as new interfaces go up and down, signal strength - for wireless interfaces changes, and new relay servers are - discovered. - - However, under 5245 rules, once a candidate pair is selected and - confirmed, via USE-CANDIDATE, nomination has completed and cannot be - restarted without performing an ICE restart. This is overly complex - in many cases, and especially problematic in some specific ones, - namely a wifi-cellular handover, where the signaling path for - communicating an ICE restart may be impacted by the handover. - - To address this situation, this section introduces the concept of - "continuous nomination", where the controlling ICE endpoint can - adjust the selected candidate pair at any time. By allowing ICE - processing to occur continuously during a call, rather than just at - call setup, the requirements expressed in Section 3.3 and Section 3.4 - can be met. - - - -Uberti & Lennox Expires May 10, 2015 [Page 6] - -Internet-Draft IceNom November 2014 - - -5.2. Operation - - Under continuous nomination, ICE never concludes; new candidates can - always be trickled, and a new candidate pair can be selected by the - controlling side at any time. 
- - When selecting a new candidate pair, the controlling side informs the - controlled side of the chosen pat by sending a new Binding Request - with a USE-CANDIDATE attribute. The decision about which candidate - pair to use is fully dynamic; the controlling side can use metrics - such as RTT or loss rate to change the selected pair at any time. If - Binding Requests need to be sent for any other reason, such as - consent checks [TODO: reference], any checks sent on the selected - pair MUST include a USE-CANDIDATE attribute. - - Upon receipt of a Binding Request with USE-CANDIDATE, the controlled - side MUST switch its media path to the candidate pair on which the - Binding Request was received. - - During continuous nomination, the controlling side may still elect to - prune certain candidate pairs; for example, an implementation may - choose to drop relay candidates once a successful connection has been - established. The controlled side, however, should follow the - controlling side's lead in terms of deciding whether any pairs should - be pruned. The controlling ICE Agent informs the remote side of its - preferences by continuing to send Binding Requests to the remote side - on each candidate pair that it wants to retain. The controlled ICE - Agent SHOULD prune any candidate pairs that have not received a - Binding Request in N seconds (30?), and SHOULD NOT keep alive any - candidates that are not associated with a live candidate pair. - [TODO: decide if this implicit timeout approach is correct, or if we - should have some sort of approach similar to TURN LIFETIME indicating - when a pair should be GCed, with LIFETIME==0 indicating immediate - GC.] One side benefit of doing this is that the continuous exchange - of Binding Requests across all candidate pairs allows the RTT and - loss rate for each to be reliably determined and kept up to date. 
- - If the endpoints have negotiated Trickle ICE support [TODO: - reference], and new candidates become available on either side, the - endpoint may send these candidates to the remote side using the - existing Trickle ICE mechanisms. Once all of the new candidates have - been transmitted, the endpoint MUST send an end-of-candidates - messages, which indicates that no more candidates will be sent in the - near future. - - At any point, either side may perform an ICE restart, which will - result in both sides gathering new ICE candidates, starting a new - - - - -Uberti & Lennox Expires May 10, 2015 [Page 7] - -Internet-Draft IceNom November 2014 - - - continuous nomination sequence, and upon successful completion, - discarding all candidates from the previous nomination sequence. - -5.3. Backwards Compatibility - - Since standard ICE implementations may not expect the selected pair - to change after a USE-CANDIDATE attribute is received, support for - continuous nomination is explicitly indicated via a new "continuous" - value for ice-options. If the remote side does not support the - "continuous" option, the controlling side MUST fall back to Regular - Nomination, as specified in [RFC5245], Sectiom 8.1.1. - -6. Examples - - TODO - -7. Security Considerations - - TODO - -8. IANA Considerations - - A new ICE option "continuous" has been [will be] registered in the - "ICE Options" registry created by [RFC6336]. - -9. Acknowledgements - - Several people provided significant input into this document, - including Martin Thomson, Brandon Williams, and Dan Wing. - -10. References - -10.1. Normative References - - [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate - Requirement Levels", BCP 14, RFC 2119, March 1997. - - [RFC5245] Rosenberg, J., "Interactive Connectivity Establishment - (ICE): A Protocol for Network Address Translator (NAT) - Traversal for Offer/Answer Protocols", RFC 5245, April - 2010. - - [RFC6336] Westerlund, M. and C. 
Perkins, "IANA Registry for - Interactive Connectivity Establishment (ICE) Options", RFC - 6336, July 2011. - - - - - - -Uberti & Lennox Expires May 10, 2015 [Page 8] - -Internet-Draft IceNom November 2014 - - -10.2. Informative References - - [I-D.ietf-mmusic-trickle-ice] - Ivov, E., Rescorla, E., and J. Uberti, "Trickle ICE: - Incremental Provisioning of Candidates for the Interactive - Connectivity Establishment (ICE) Protocol", draft-ietf- - mmusic-trickle-ice-01 (work in progress), February 2014. - - [I-D.singh-avtcore-mprtp] - Singh, V., Karkkainen, T., Ott, J., Ahsan, S., and L. - Eggert, "Multipath RTP (MPRTP)", draft-singh-avtcore- - mprtp-09 (work in progress), June 2014. - - [I-D.williams-peer-redirect] - Williams, B. and T. Reddy, "Peer-specific Redirection for - Traversal Using Relays around NAT (TURN)", draft-williams- - peer-redirect-01 (work in progress), June 2014. - - [I-D.wing-mmusic-ice-mobility] - Wing, D., Reddy, T., Patil, P., and P. Martinsen, - "Mobility with ICE (MICE)", draft-wing-mmusic-ice- - mobility-07 (work in progress), June 2014. - -Appendix A. Change log - - Changes in draft -00: - - o Initial version, from mailing list discussion post-IETF 90. - -Authors' Addresses - - Justin Uberti - Google - 747 6th Ave S - Kirkland, WA 98033 - USA - - Email: justin@uberti.name - - - Jonathan Lennox - Vidyo - 433 Hackensack Avenue - Hackensack, NJ 07601 - USA - - Email: jonathan@vidyo.com - - - - -Uberti & Lennox Expires May 10, 2015 [Page 9] diff --git a/vp9/draft-ietf-payload-vp9-00.html b/vp9/draft-ietf-payload-vp9-00.html new file mode 100644 index 0000000..2b38461 --- /dev/null +++ b/vp9/draft-ietf-payload-vp9-00.html @@ -0,0 +1,1147 @@ + + + + + + + RTP Payload Format for VP9 Video + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Payload Working GroupJ. Uberti
Internet-DraftS. Holmer
Intended status: Standards TrackM. Flodman
Expires: April 17, 2016Google
J. Lennox
D. Hong
Vidyo
October 15, 2015
+ +

RTP Payload Format for VP9 Video
+ draft-ietf-payload-vp9-00

+ +

+ Abstract +

+

This memo describes an RTP payload format for the VP9 video codec. The payload format has wide applicability, as it supports applications from low bit-rate peer-to-peer usage, to high bit-rate video conferences. It includes provisions for temporal and spatial scalability.

+

+ Status of This Memo +

+

This Internet-Draft is submitted in full conformance with the provisions of BCP 78 and BCP 79.

+

Internet-Drafts are working documents of the Internet Engineering Task Force (IETF). Note that other groups may also distribute working documents as Internet-Drafts. The list of current Internet-Drafts is at http://datatracker.ietf.org/drafts/current/.

+

Internet-Drafts are draft documents valid for a maximum of six months and may be updated, replaced, or obsoleted by other documents at any time. It is inappropriate to use Internet-Drafts as reference material or to cite them other than as "work in progress."

+

This Internet-Draft will expire on April 17, 2016.

+

+ Copyright Notice +

+

Copyright (c) 2015 IETF Trust and the persons identified as the document authors. All rights reserved.

+

This document is subject to BCP 78 and the IETF Trust's Legal Provisions Relating to IETF Documents (http://trustee.ietf.org/license-info) in effect on the date of publication of this document. Please review these documents carefully, as they describe your rights and restrictions with respect to this document. Code Components extracted from this document must include Simplified BSD License text as described in Section 4.e of the Trust Legal Provisions and are provided without warranty as described in the Simplified BSD License.

+ + +
+

Table of Contents

+ + +

1. Introduction

+

This memo describes an RTP payload specification applicable to the transmission of video streams encoded using the VP9 video codec [I-D.grange-vp9-bitstream]. The format described in this document can be used both in peer-to-peer and video conferencing applications.

+

TODO: VP9 description. Please see [I-D.grange-vp9-bitstream].

+

2. Conventions, Definitions and Acronyms

+

The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC2119].

+

3. Media Format Description

+

The VP9 codec can maintain up to eight reference frames, of which up to three can be referenced or updated by any new frame.

+

VP9 also allows a reference frame to be resampled and used as a reference for another frame of a different resolution. This allows internal resolution changes without requiring the use of key frames.

+

These features together enable an encoder to implement various forms of coarse-grained scalability, including temporal, spatial and quality scalability modes, as well as combinations of these, without the need for explicit scalable coding tools.

+

Temporal layers define different frame rates of video; spatial and quality layers define different and possibly dependent representations of a single input frame. Spatial layers allow a frame to be encoded at different resolutions, whereas quality layers allow a frame to be encoded at the same resolution but at different qualities (and thus with different amounts of coding error). VP9 supports quality layers as spatial layers without any resolution changes; hereinafter, the term "spatial layer" is used to represent both spatial and quality layers.

+

This payload format specification defines how such temporal and spatial scalability layers can be described and communicated.

+

Layers are designed (and MUST be encoded) such that if any layer, and all higher layers, are removed from the bitstream along any of the two dimensions, the remaining bitstream is still correctly decodable.

+

For terminology, this document uses the term "layer frame" to refer to a single encoded VP9 frame for a particular resolution/quality, and "super frame" to refer to all the representations (layer frames) at a single instant in time. A super frame thus consists of one or more layer frames, encoding different spatial layers.

+

Within a super frame, a layer frame with spatial layer ID equal to S, where S > 0, can depend on a frame with a lower spatial layer ID. This "inter-layer" dependency results in additional coding gain to the traditional "inter-picture" dependency, where a frame depends on a previously coded frame in time. For simplicity, this payload format assumes that, within a super frame if inter-layer dependency is used, a spatial layer S frame can only depend on spatial layer S-1 frame when S > 0. Additionally, if inter-picture dependency is used, spatial layer S frame is assumed to only depend on previously coded spatial layer S frame.

+

TODO: Describe how simulcast can be supported?

+

Given above simplifications for inter-layer and inter-picture dependencies, a flag (the D bit described below) is used to indicate whether a spatial layer S frame depends on spatial layer S-1 frame. Given the D bit, a receiver only needs to additionally know the inter-picture dependency structure for a given spatial layer frame in order to determine its decodability. Two modes of describing the inter-picture dependency structure are possible: "flexible mode" and "non-flexible mode". An encoder can only switch between the two on the very first packet of a key frame with temporal layer ID equal to 0.

+

In flexible mode, each packet can contain up to 3 reference indices, which identify all frames referenced by the frame transmitted in the current packet for inter-picture prediction. This (along with the D bit) enables a receiver to identify if a frame is decodable or not and helps it understand the temporal layer structure so that it can drop packets as it sees fit. Since this is signaled in each packet it makes it possible to have very flexible temporal layer hierarchies and patterns which are changing dynamically.

+

In non-flexible mode, the inter-picture dependency (the reference indices) of a group of frames (GOF) MUST be pre-specified as part of the scalability structure (SS) data. In this mode, each packet MUST have an index to refer to one of the described frames in the GOF, from which the frames referenced by the frame transmitted in the current packet for inter-picture prediction can be identified.

+

The SS data can also be used to specify the resolution of each spatial layer present in the VP9 stream for both flexible and non-flexible modes.

+

4. Payload Format

+

This section describes how the encoded VP9 bitstream is encapsulated in RTP. To handle network losses usage of RTP/AVPF [RFC4585] is RECOMMENDED. All integer fields in the specifications are encoded as unsigned integers in network octet order.

+

4.1. RTP Header Usage

+
+
+

The general RTP payload format for VP9 is depicted below.

+
+   0                   1                   2                   3
+   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |V=2|P|X|  CC   |M|     PT      |       sequence number         |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |                           timestamp                           |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |           synchronization source (SSRC) identifier            |
+  +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
+  |            contributing source (CSRC) identifiers             |
+  |                             ....                              |
+  +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
+  |            VP9 payload descriptor (integer #bytes)            |
+  :                                                               :
+  |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |                               : VP9 pyld hdr  |               |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+               |
+  |                                                               |
+  +                                                               |
+  :                   Bytes 2..N of VP9 payload                   :
+  |                                                               |
+  |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |                               :    OPTIONAL RTP padding       |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+          
+

The VP9 payload descriptor and VP9 payload header will be described in the next section. OPTIONAL RTP padding MUST NOT be included unless the P bit is set.

+

Figure 1

+

+ +

+
Marker bit (M):
+
MUST be set to 1 for the final packet of the highest spatial layer frame (the final packet of the super frame), and 0 otherwise. Unless spatial scalability is in use for this super frame, this will have the same value as the E bit described below. Note that a MANE MUST set this value to 1 for the target spatial layer frame when shaping out higher spatial layers.
+
Timestamp:
+
The RTP timestamp indicates the time when the input frame was sampled, at a clock rate of 90 kHz. If the input frame is encoded with multiple layer frames, all of the layer frames of the super frame MUST have the same timestamp.
+
Sequence number:
+
The sequence numbers are monotonically increasing in order of the encoded bitstream.
+
+

The remaining RTP header fields are used as specified in [RFC3550].

+

4.2. VP9 Payload Description

+
+
+

In flexible mode (with the F bit below set to 1), the first octets after the RTP header are the VP9 payload descriptor, with the following structure.

+
+      0 1 2 3 4 5 6 7
+     +-+-+-+-+-+-+-+-+
+     |I|P|L|F|B|E|V|-| (REQUIRED)
+     +-+-+-+-+-+-+-+-+
+I:   |M| PICTURE ID  | (REQUIRED)
+     +-+-+-+-+-+-+-+-+
+M:   | EXTENDED PID  | (RECOMMENDED)
+     +-+-+-+-+-+-+-+-+
+L:   |  T  |U|  S  |D| (CONDITIONALLY RECOMMENDED)
+     +-+-+-+-+-+-+-+-+                             -\
+P,F: | P_DIFF      |N| (CONDITIONALLY REQUIRED)    - up to 3 times
+     +-+-+-+-+-+-+-+-+                             -/
+V:   | SS            |
+     | ..            |
+     +-+-+-+-+-+-+-+-+
+
+            
+

Figure 2

+
+
+

In non-flexible mode (with the F bit below set to 0), the first octets after the RTP header are the VP9 payload descriptor, with the following structure.

+
+      0 1 2 3 4 5 6 7
+     +-+-+-+-+-+-+-+-+
+     |I|P|L|F|B|E|V|-| (REQUIRED)
+     +-+-+-+-+-+-+-+-+
+I:   |M| PICTURE ID  | (RECOMMENDED)
+     +-+-+-+-+-+-+-+-+
+M:   | EXTENDED PID  | (RECOMMENDED)
+     +-+-+-+-+-+-+-+-+
+L:   |  T  |U|  S  |D| (CONDITIONALLY RECOMMENDED)
+     +-+-+-+-+-+-+-+-+
+     |   TL0PICIDX   | (CONDITIONALLY REQUIRED)
+     +-+-+-+-+-+-+-+-+
+V:   | SS            |
+     | ..            |
+     +-+-+-+-+-+-+-+-+
+
+            
+

Figure 3

+

+ +

+
I:
+
Picture ID (PID) present. When set to one, the OPTIONAL PID MUST be present after the mandatory first octet and specified as below. Otherwise, PID MUST NOT be present.
+
P:
+
Inter-picture predicted layer frame. When set to zero, the layer frame does not utilize inter-picture prediction. In this case, up-switching to current spatial layer's frame is possible from directly lower spatial layer frame. P SHOULD also be set to zero when encoding a layer synchronization frame in response to an LRR [I-D.lennox-avtext-lrr]. When P is set to zero, the T bit (described below) MUST also be set to 0 (if present).
+
L:
+
Layer indices present. When set to one, the one or two octets following the mandatory first octet and the PID (if present) is as described by "Layer indices" below. If the F bit (described below) is set to 1 (indicating flexible mode), then only one octet is present for the layer indices. Otherwise if the F bit is set to 0 (indicating non-flexible mode), then two octets are present for the layer indices.
+
F:
+
Flexible mode. F set to one indicates flexible mode and if the P bit is also set to one, then the octets following the mandatory first octet, the PID, and layer indices (if present) are as described by "Reference indices" below. This MUST only be set to 1 if the I bit is also set to one; if the I bit is set to zero, then this MUST also be set to zero and ignored by receivers. The value of this F bit CAN ONLY CHANGE on the very first packet of a key picture. This is a packet with the P bit equal to zero, S or D bit (described below) equal to zero, and B bit (described below) equal to 1.
+
B:
+
Start of a layer frame. MUST be set to 1 if the first payload octet of the RTP packet is the beginning of a new VP9 layer frame, and MUST NOT be 1 otherwise. Note that this layer frame might not be the very first layer frame of a super frame.
+
E:
+
End of a layer frame. MUST be set to 1 for the final RTP packet of a VP9 layer frame, and 0 otherwise. This enables a decoder to finish decoding the layer frame, where it otherwise may need to wait for the next packet to explicitly know that the layer frame is complete. Note that, if spatial scalability is in use, more layer frames from the same super frame may follow; see the description of the M bit above.
+
V:
+
Scalability structure (SS) data present. When set to one, the OPTIONAL SS data MUST be present in the payload descriptor. Otherwise, the SS data MUST NOT be present.
+
-:
+
Bit reserved for future use. MUST be set to zero and MUST be ignored by the receiver.
+
+

The mandatory first octet is followed by the extension data fields that are enabled:

+ +
+
M:
+
The most significant bit of the first octet is an extension flag. The field MUST be present if the I bit is equal to one. If set, the PID field MUST contain 15 bits; otherwise, it MUST contain 7 bits. See PID below.
+
Picture ID (PID):
+
Picture ID represented in 7 or 15 bits, depending on the M bit. This is a running index of the pictures. The field MUST be present if the I bit is equal to one. If M is set to zero, 7 bits carry the PID; else if M is set to one, 15 bits carry the PID. The sender may choose between 7 or 15 bits index. The PID SHOULD start on a random number, and MUST wrap after reaching the maximum ID. The receiver MUST NOT assume that the number of bits in PID stays the same through the session.
+
+
In the non-flexible mode (when the F bit is set to 0), this PID is used as an index to the GOF specified in the SS data below. In this mode, the PID of the key frame corresponds to the very first specified frame in the GOF. Then subsequent PIDs are mapped to subsequently specified frames in the GOF (modulo N_G, specified in the SS data below), respectively.
+
Layer indices:
+
This information is optional but recommended whenever encoding with layers. For both flexible and non-flexible modes, one octet is used to specify a layer frame's temporal layer ID (T) and spatial layer ID (S) as shown both in Figure 2 and Figure 3. Additionally, a bit (U) is used to indicate that the current frame is a "switching up point" frame. Another bit (D) is used to indicate whether inter-layer prediction is used for the current layer frame.
+
+
In the non-flexible mode (when the F bit is set to 0), another octet is used to represent temporal layer 0 index (TL0PICIDX), as depicted in Figure 3. The TL0PICIDX is present so that all minimally required frames - the base temporal layer frames - can be tracked.
+
+
The T and S fields indicate the temporal and spatial layers and can help MCUs measure bitrates per layer and can help them make a quick decision on whether to relay a packet or not. They can also help receivers determine what layers they are currently decoding.
T:
The temporal layer ID of the current frame. In the case of non-flexible mode, if PID is mapped to a frame in a specified GOF, then the value of T MUST match the corresponding T value of the mapped frame in the GOF.
U:
Switching up point. If this bit is set to 1 for the current frame with temporal layer ID equal to T, then "switch up" to a higher frame rate is possible as subsequent higher temporal layer frames will not depend on any frame before the current frame (in coding time) with temporal layer ID greater than T.
S:
The spatial layer ID of current frame. Note that frames with spatial layer S > 0 may be dependent on decoded spatial layer S-1 frame within the same super frame.
D:
Inter-layer dependency used. MUST be set to one if current spatial layer S frame depends on spatial layer S-1 frame of the same super frame. MUST only be set to zero if current spatial layer S frame does not depend on spatial layer S-1 frame of the same super frame. For the base layer frame with S equal to 0, this D bit MUST be set to zero.
TL0PICIDX:
8 bits temporal layer zero index. TL0PICIDX is only present in the non-flexible mode (F = 0). This is a running index for the temporal base layer frames, i.e., the frames with T set to 0. If T is larger than 0, TL0PICIDX indicates which temporal base layer frame the current frame depends on. TL0PICIDX MUST be incremented when T is equal to 0. The index SHOULD start on a random number, and MUST restart at 0 after reaching the maximum number 255.
+
Reference indices:
+
When P and F are both set to one, indicating a non-key frame in flexible mode, then at least one reference index has to be specified as below. Additional reference indices (total of up to 3 reference indices are allowed) may be specified using the N bit below. When either P or F is set to zero, then no reference index is specified.
P_DIFF:
The reference index (in 7 bits) specified as the relative PID from the current frame. For example, P_DIFF=3 on a packet containing the frame with PID 112 means that the frame refers back to the frame with PID 109. This calculation is done modulo the size of the PID field, i.e., either 7 or 15 bits.
N:
1 if there is additional P_DIFF following the current P_DIFF.
+
+

4.2.1. Scalability Structure (SS):

+

The scalability structure (SS) data describes the resolution of each layer frame within a super frame as well as the inter-picture dependencies for a group of frames (GOF). If the VP9 payload descriptor's "V" bit is set, the SS data is present in the position indicated in Figure 2 and Figure 3.

+
+
+
+     +-+-+-+-+-+-+-+-+
+V:   | N_S |Y|G|-|-|-|
+     +-+-+-+-+-+-+-+-+              -\
+Y:   |     WIDTH     | (OPTIONAL)    .
+     +               +               .
+     |               | (OPTIONAL)    .
+     +-+-+-+-+-+-+-+-+               . - N_S + 1 times
+     |     HEIGHT    | (OPTIONAL)    .
+     +               +               .
+     |               | (OPTIONAL)    .
+     +-+-+-+-+-+-+-+-+              -/
+G:   |      N_G      | (OPTIONAL)
+     +-+-+-+-+-+-+-+-+                            -\
+N_G: |  T  |U| R |-|-| (OPTIONAL)                 .
+     +-+-+-+-+-+-+-+-+              -\            . - N_G times
+     |    P_DIFF     | (OPTIONAL)    . - R times  .
+     +-+-+-+-+-+-+-+-+              -/            -/
+            
+

Figure 4

+

+ +

+
N_S:
+
N_S + 1 indicates the number of spatial layers present in the VP9 stream.
+
Y:
+
Each spatial layer's frame resolution present. When set to one, the OPTIONAL WIDTH (2 octets) and HEIGHT (2 octets) MUST be present for each layer frame. Otherwise, the resolution MUST NOT be present.
+
G:
+
GOF description present flag.
+
-:
+
Bit reserved for future use. MUST be set to zero and MUST be ignored by the receiver.
+
N_G:
+
N_G indicates the number of frames in a GOF. If N_G is greater than 0, then the SS data allows the inter-picture dependency structure of the VP9 stream to be pre-declared, rather than indicating it on the fly with every packet. If N_G is greater than 0, then for N_G pictures in the GOF, each frame's temporal layer ID (T), switch up point (U), and the R reference indices (P_DIFFs) are specified.
+
+
The very first frame specified in the GOF MUST have T set to 0.
+
+
G set to 0 or N_G set to 0 indicates that either there is only one temporal layer or no fixed inter-picture dependency information is present going forward in the bitstream.
+
+
Note that for a given super frame, all layer frames follow the same inter-picture dependency structure. However, the frame rate of each spatial layer can be different from each other and this can be controlled with the use of the D bit described above. The specified dependency structure in the SS data MUST be for the highest frame rate layer.
+
+

In a scalable stream sent with a fixed pattern, the SS data SHOULD be included in the first packet of every key frame. This is a packet with P bit equal to zero, S or D bit equal to zero, and B bit equal to 1. The SS data MUST only be changed on the frame that corresponds to the very first frame specified in the previous SS data's GOF (if the previous SS data's N_G was greater than 0).

+

4.3. VP9 Payload Header

+

TODO: need to describe VP9 payload header.

+

4.4. Frame Fragmentation

+

VP9 frames are fragmented into packets, in RTP sequence number order, beginning with a packet with the B bit set, and ending with a packet with the RTP marker bit set. There is no mechanism for finer-grained access to parts of a VP9 frame.

+

4.5. Examples of VP9 RTP Stream

+

TODO

+

5. Using VP9 with RPSI and SLI Feedback

+

The VP9 payload descriptor defined in Section 4.2 above contains an optional PictureID parameter. One use of this parameter is included to enable use of reference picture selection index (RPSI) and slice loss indication (SLI), both defined in [RFC4585].

+

5.1. RPSI

+

TODO: Update to indicate which frame within the picture.

+

The reference picture selection index is a payload-specific feedback message defined within the RTCP-based feedback format. The RPSI message is generated by a receiver and can be used in two ways. Either it can signal a preferred reference picture when a loss has been detected by the decoder -- preferably then a reference that the decoder knows is perfect -- or, it can be used as positive feedback information to acknowledge correct decoding of certain reference pictures. The positive feedback method is useful for VP9 used as unicast. The use of RPSI for VP9 is preferably combined with a special update pattern of the codec's two special reference frames -- the golden frame and the altref frame -- in which they are updated in an alternating leapfrog fashion. When a receiver has received and correctly decoded a golden or altref frame, and that frame had a PictureID in the payload descriptor, the receiver can acknowledge this simply by sending an RPSI message back to the sender. The message body (i.e., the "native RPSI bit string" in [RFC4585]) is simply the PictureID of the received frame.

+

5.2. SLI

+

TODO: Update to indicate which frame within the picture.

+

The slice loss indication is another payload-specific feedback message defined within the RTCP-based feedback format. The SLI message is generated by the receiver when a loss or corruption is detected in a frame. The format of the SLI message is as follows [RFC4585]:

+
+
+
+   0                   1                   2                   3
+   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |         First           |        Number           | PictureID |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            
+

Figure 5

+

Here, First is the macroblock address (in scan order) of the first lost block and Number is the number of lost blocks. PictureID is the six least significant bits of the codec-specific picture identifier in which the loss or corruption has occurred. For VP9, this codec-specific identifier is naturally the PictureID of the current frame, as read from the payload descriptor. If the payload descriptor of the current frame does not have a PictureID, the receiver MAY send the last received PictureID+1 in the SLI message. The receiver MAY set the First parameter to 0, and the Number parameter to the total number of macroblocks per frame, even though only part of the frame is corrupted. When the sender receives an SLI message, it can make use of the knowledge from the latest received RPSI message. Knowing that the last golden or altref frame was successfully received, it can encode the next frame with reference to that established reference.

+

5.3. Example

+

TODO: this example is copied from the VP8 payload format specification, and has not been updated for VP9. It may be incorrect.

+

The use of RPSI and SLI is best illustrated in an example. In this example, the encoder may not update the altref frame until the last sent golden frame has been acknowledged with an RPSI message. If an update is not received within some time, a new golden frame update is sent instead. Once the new golden frame is established and acknowledged, the same rule applies when updating the altref frame.

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Example signaling between sender and receiver
EventSenderReceiverEstablished reference
1000Send golden frame PictureID = 0 + +
+ + Receive and decode golden frame +
1001 + Send RPSI(0) +
1002Receive RPSI(0) + golden
...(sending regular frames) + +
1100Send altref frame PictureID = 100 + +
+ + Altref corrupted or lostgolden
1101 + Send SLI(100)golden
1102Receive SLI(100) + +
1103Send frame with reference to golden + +
+ + Receive and decode frame (decoder state restored)golden
...(sending regular frames) + +
1200Send altref frame PictureID = 200 + +
+ + Receive and decode altref framegolden
1201 + Send RPSI(200) +
1202Receive RPSI(200) + altref
...(sending regular frames) + +
1300Send golden frame PictureID = 300 + +
+ + Receive and decode golden framealtref
1301 + Send RPSI(300)altref
1302RPSI lost + +
1400Send golden frame PictureID = 400 + +
+ + Receive and decode golden framealtref
1401 + Send RPSI(400) +
1402Receive RPSI(400) + golden
+

Note that the scheme is robust to loss of the feedback messages. If the RPSI is lost, the sender will try to update the golden (or altref) again after a while, without releasing the established reference. Also, if an SLI is lost, the receiver can keep sending SLI messages at any interval allowed by the RTCP sending timing restrictions as specified in [RFC4585], as long as the picture is corrupted.

+

6. Payload Format Parameters

+

This payload format has two required parameters.

+

6.1. Media Type Definition

+

This registration is done using the template defined in [RFC6838] and following [RFC4855].

+ +
+
Type name:
+
video
+
Subtype name:
+
VP9
+
Required parameters:
+

These parameters MUST be used to signal the capabilities of a receiver implementation. These parameters MUST NOT be used for any other purpose.
max-fr:
The value of max-fr is an integer indicating the maximum frame rate in units of frames per second that the decoder is capable of decoding.
max-fs:
The value of max-fs is an integer indicating the maximum frame size in units of macroblocks that the decoder is capable of decoding.
The decoder is capable of decoding this frame size as long as the width and height of the frame in macroblocks are less than int(sqrt(max-fs * 8)) - for instance, a max-fs of 1200 (capable of supporting 640x480 resolution) will support widths and heights up to 1552 pixels (97 macroblocks).
+
Encoding considerations:
+

This media type is framed in RTP and contains binary data; see Section 4.8 of [RFC6838].
+
Security considerations:
+
See Section 7 of RFC XXXX.
[RFC Editor: Upon publication as an RFC, please replace "XXXX" with the number assigned to this document and remove this note.]
+
Interoperability considerations:
+
None.
+
Published specification:
+
VP9 bitstream format [I-D.grange-vp9-bitstream] and RFC XXXX.
[RFC Editor: Upon publication as an RFC, please replace "XXXX" with the number assigned to this document and remove this note.]
+
Applications which use this media type:
+

For example: Video over IP, video conferencing.
+
Fragment identifier considerations:
+
N/A.
+
Additional information:
+
None.
+
Person & email address to contact for further information:
+

TODO [Pick a contact]
+
Intended usage:
+
COMMON
+
Restrictions on usage:
+

This media type depends on RTP framing, and hence is only defined for transfer via RTP [RFC3550].
+
Author:
+
TODO [Pick a contact]
+
Change controller:
+

IETF Payload Working Group delegated from the IESG.
+
+

6.2. SDP Parameters

+

The receiver MUST ignore any fmtp parameter unspecified in this memo.

+

6.2.1. Mapping of Media Subtype Parameters to SDP

+

The media type video/VP9 string is mapped to fields in the Session Description Protocol (SDP) [RFC4566] as follows:

+ +
    +
  • The media name in the "m=" line of SDP MUST be video.
  • +
  • The encoding name in the "a=rtpmap" line of SDP MUST be VP9 (the media subtype).
  • +
  • The clock rate in the "a=rtpmap" line MUST be 90000.
  • +
  • The parameters "max-fs", and "max-fr", MUST be included in the "a=fmtp" line of SDP if SDP is used to declare receiver capabilities. These parameters are expressed as a media subtype string, in the form of a semicolon separated list of parameter=value pairs.
  • +
+

6.2.1.1. Example

+

An example of media representation in SDP is as follows:

+

m=video 49170 RTP/AVPF 98
a=rtpmap:98 VP9/90000
a=fmtp:98 max-fr=30; max-fs=3600;

+

6.2.2. Offer/Answer Considerations

+

TODO: Update this for VP9

+

7. Security Considerations

+

RTP packets using the payload format defined in this specification are subject to the security considerations discussed in the RTP specification [RFC3550], and in any applicable RTP profile. The main security considerations for the RTP packet carrying the RTP payload format defined within this memo are confidentiality, integrity and source authenticity. Confidentiality is achieved by encryption of the RTP payload. Integrity of the RTP packets is achieved through a suitable cryptographic integrity protection mechanism. Such a cryptographic system may also allow the authentication of the source of the payload. A suitable security mechanism for this RTP payload format should provide confidentiality, integrity protection and at least source authentication capable of determining if an RTP packet is from a member of the RTP session or not. Note that the appropriate mechanism to provide security to RTP and payloads following this memo may vary. It is dependent on the application, the transport, and the signaling protocol employed. Therefore a single mechanism is not sufficient, although if suitable the usage of SRTP [RFC3711] is recommended. This RTP payload format and its media decoder do not exhibit any significant non-uniformity in the receiver-side computational complexity for packet processing, and thus are unlikely to pose a denial-of-service threat due to the receipt of pathological data. Nor does the RTP payload format contain any active content.

+

8. Congestion Control

+

Congestion control for RTP SHALL be used in accordance with RFC 3550 [RFC3550], and with any applicable RTP profile; e.g., RFC 3551 [RFC3551]. The congestion control mechanism can, in a real-time encoding scenario, adapt the transmission rate by instructing the encoder to encode at a certain target rate. Media aware network elements MAY use the information in the VP9 payload descriptor in Section 4.2 to identify non-reference frames and discard them in order to reduce network congestion. Note that discarding of non-reference frames cannot be done if the stream is encrypted (because the non-reference marker is encrypted).

+

9. IANA Considerations

+

The IANA is requested to register the following values:
- Media type registration as described in Section 6.1.

+

10. References

+

10.1. Normative References

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ [I-D.grange-vp9-bitstream] + Grange, A. and H. Alvestrand, "A VP9 Bitstream Overview", Internet-Draft draft-grange-vp9-bitstream-00, February 2013.
+ [I-D.lennox-avtext-lrr] + Lennox, J., Hong, D., Uberti, J., Holmer, S. and M. Flodman, "The Layer Refresh Request (LRR) RTCP Feedback Message", Internet-Draft draft-lennox-avtext-lrr-00, March 2015.
+ [RFC2119] + Bradner, S., "Key words for use in RFCs to Indicate Requirement Levels", BCP 14, RFC 2119, DOI 10.17487/RFC2119, March 1997.
+ [RFC3550] + Schulzrinne, H., Casner, S., Frederick, R. and V. Jacobson, "RTP: A Transport Protocol for Real-Time Applications", STD 64, RFC 3550, DOI 10.17487/RFC3550, July 2003.
+ [RFC4566] + Handley, M., Jacobson, V. and C. Perkins, "SDP: Session Description Protocol", RFC 4566, DOI 10.17487/RFC4566, July 2006.
+ [RFC4585] + Ott, J., Wenger, S., Sato, N., Burmeister, C. and J. Rey, "Extended RTP Profile for Real-time Transport Control Protocol (RTCP)-Based Feedback (RTP/AVPF)", RFC 4585, DOI 10.17487/RFC4585, July 2006.
+ [RFC4855] + Casner, S., "Media Type Registration of RTP Payload Formats", RFC 4855, DOI 10.17487/RFC4855, February 2007.
+ [RFC6838] + Freed, N., Klensin, J. and T. Hansen, "Media Type Specifications and Registration Procedures", BCP 13, RFC 6838, DOI 10.17487/RFC6838, January 2013.
+

10.2. Informative References

+ + + + + + + + + + + +
+ [RFC3551] + Schulzrinne, H. and S. Casner, "RTP Profile for Audio and Video Conferences with Minimal Control", STD 65, RFC 3551, DOI 10.17487/RFC3551, July 2003.
+ [RFC3711] + Baugher, M., McGrew, D., Naslund, M., Carrara, E. and K. Norrman, "The Secure Real-time Transport Protocol (SRTP)", RFC 3711, DOI 10.17487/RFC3711, March 2004.
+

+ Authors' Addresses +

+
+
+ + Justin Uberti + + + Google, Inc. + + 747 6th Street South + + + Kirkland, + WA + 98033 + + USA + + EMail: justin@uberti.name + +
+
+
+ + Stefan Holmer + + + Google, Inc. + + Kungsbron 2 + + + Stockholm, + + 111 22 + + Sweden + + EMail: holmer@google.com + +
+
+
+ + Magnus Flodman + + + Google, Inc. + + Kungsbron 2 + + + Stockholm, + + 111 22 + + Sweden + + EMail: mflodman@google.com + +
+
+
+ + Jonathan Lennox + + + Vidyo, Inc. + + 433 Hackensack Avenue +Seventh Floor + + + Hackensack, + NJ + 07601 + + US + + EMail: jonathan@vidyo.com + +
+
+
+ + Danny Hong + + + Vidyo, Inc. + + 433 Hackensack Avenue +Seventh Floor + + + Hackensack, + NJ + 07601 + + US + + EMail: danny@vidyo.com + +
+
+ + + diff --git a/vp9/draft-uberti-payload-vp9-00.txt b/vp9/draft-ietf-payload-vp9-00.txt similarity index 54% rename from vp9/draft-uberti-payload-vp9-00.txt rename to vp9/draft-ietf-payload-vp9-00.txt index e76d949..cc9c8fc 100644 --- a/vp9/draft-uberti-payload-vp9-00.txt +++ b/vp9/draft-ietf-payload-vp9-00.txt @@ -5,14 +5,15 @@ Payload Working Group J. Uberti Internet-Draft S. Holmer Intended status: Standards Track M. Flodman -Expires: April 30, 2015 Google +Expires: April 17, 2016 Google J. Lennox + D. Hong Vidyo - October 27, 2014 + October 15, 2015 RTP Payload Format for VP9 Video - draft-uberti-payload-vp9-00 + draft-ietf-payload-vp9-00 Abstract @@ -37,11 +38,11 @@ Status of This Memo time. It is inappropriate to use Internet-Drafts as reference material or to cite them other than as "work in progress." - This Internet-Draft will expire on April 30, 2015. + This Internet-Draft will expire on April 17, 2016. Copyright Notice - Copyright (c) 2014 IETF Trust and the persons identified as the + Copyright (c) 2015 IETF Trust and the persons identified as the document authors. All rights reserved. This document is subject to BCP 78 and the IETF Trust's Legal @@ -49,15 +50,15 @@ Copyright Notice (http://trustee.ietf.org/license-info) in effect on the date of publication of this document. Please review these documents carefully, as they describe your rights and restrictions with respect - to this document. Code Components extracted from this document must -Uberti, et al. Expires April 30, 2015 [Page 1] +Uberti, et al. Expires April 17, 2016 [Page 1] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 + to this document. Code Components extracted from this document must include Simplified BSD License text as described in Section 4.e of the Trust Legal Provisions and are provided without warranty as described in the Simplified BSD License. @@ -70,26 +71,26 @@ Table of Contents 4. Payload Format . . . . . . 
. . . . . . . . . . . . . . . . . 4 4.1. RTP Header Usage . . . . . . . . . . . . . . . . . . . . 4 4.2. VP9 Payload Description . . . . . . . . . . . . . . . . . 6 - 4.2.1. Scalability Structure (SS): . . . . . . . . . . . . . 8 - 4.2.2. Scalability Structure Update (SU): . . . . . . . . . 9 - 4.3. VP9 Payload Header . . . . . . . . . . . . . . . . . . . 10 - 4.4. Frame Fragmentation . . . . . . . . . . . . . . . . . . . 10 - 4.5. Examples of VP9 RTP Stream . . . . . . . . . . . . . . . 10 - 5. Using VP9 with RPSI and SLI Feedback . . . . . . . . . . . . 10 - 5.1. RPSI . . . . . . . . . . . . . . . . . . . . . . . . . . 10 - 5.2. SLI . . . . . . . . . . . . . . . . . . . . . . . . . . . 11 - 5.3. Example . . . . . . . . . . . . . . . . . . . . . . . . . 11 - 6. Layer Intra Request . . . . . . . . . . . . . . . . . . . . . 13 - 7. Payload Format Parameters . . . . . . . . . . . . . . . . . . 14 - 7.1. Media Type Definition . . . . . . . . . . . . . . . . . . 14 - 7.2. SDP Parameters . . . . . . . . . . . . . . . . . . . . . 15 - 7.2.1. Mapping of Media Subtype Parameters to SDP . . . . . 16 - 7.2.2. Offer/Answer Considerations . . . . . . . . . . . . . 16 - 8. Security Considerations . . . . . . . . . . . . . . . . . . . 16 - 9. Congestion Control . . . . . . . . . . . . . . . . . . . . . 17 - 10. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 17 - 11. References . . . . . . . . . . . . . . . . . . . . . . . . . 17 - Authors' Addresses . . . . . . . . . . . . . . . . . . . . . . . 18 + 4.2.1. Scalability Structure (SS): . . . . . . . . . . . . . 10 + 4.3. VP9 Payload Header . . . . . . . . . . . . . . . . . . . 12 + 4.4. Frame Fragmentation . . . . . . . . . . . . . . . . . . . 12 + 4.5. Examples of VP9 RTP Stream . . . . . . . . . . . . . . . 12 + 5. Using VP9 with RPSI and SLI Feedback . . . . . . . . . . . . 12 + 5.1. RPSI . . . . . . . . . . . . . . . . . . . . . . . . . . 12 + 5.2. SLI . . . . . . . . . . . . . . . . . . . . . . . . . . . 
13 + 5.3. Example . . . . . . . . . . . . . . . . . . . . . . . . . 14 + 6. Payload Format Parameters . . . . . . . . . . . . . . . . . . 16 + 6.1. Media Type Definition . . . . . . . . . . . . . . . . . . 16 + 6.2. SDP Parameters . . . . . . . . . . . . . . . . . . . . . 17 + 6.2.1. Mapping of Media Subtype Parameters to SDP . . . . . 17 + 6.2.2. Offer/Answer Considerations . . . . . . . . . . . . . 18 + 7. Security Considerations . . . . . . . . . . . . . . . . . . . 18 + 8. Congestion Control . . . . . . . . . . . . . . . . . . . . . 18 + 9. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 19 + 10. References . . . . . . . . . . . . . . . . . . . . . . . . . 19 + 10.1. Normative References . . . . . . . . . . . . . . . . . . 19 + 10.2. Informative References . . . . . . . . . . . . . . . . . 20 + Authors' Addresses . . . . . . . . . . . . . . . . . . . . . . . 20 1. Introduction @@ -108,10 +109,9 @@ Table of Contents - -Uberti, et al. Expires April 30, 2015 [Page 2] +Uberti, et al. Expires April 17, 2016 [Page 2] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 3. Media Format Description @@ -121,67 +121,88 @@ Internet-Draft RTP Payload Format for VP9 October 2014 VP9 also allows a reference frame to be resampled and used as a reference for another frame of a different resolution. This allows - internal resolution changes without requiring the use of keyframes. + internal resolution changes without requiring the use of key frames. These features together enable an encoder to implement various forms - of coarse-grained scalability, including temporal, spatial, and + of coarse-grained scalability, including temporal, spatial and quality scalability modes, as well as combinations of these, without - the need for explicit spatially scalabile encoding modes. - - This payload format specification defines how such scalability modes - can be encoded and communicated. 
In this payload, three separate - types of layers are defined: temporal, spatial, and quality. + the need for explicit scalable coding tools. Temporal layers define different frame rates of video; spatial and - quality layers define different, dependent representations of a - single picture. Spatial layers allow a picture to be encoded at - different resolutions, whereas quality layers allow a picture to be - encoded at the same resolution but at different bitrates (and thus - with different amounts of coding error). + quality layers define different and possibly dependent + representations of a single input frame. Spatial layers allow a + frame to be encoded at different resolutions, whereas quality layers + allow a frame to be encoded at the same resolution but at different + qualities (and thus with different amounts of coding error). VP9 + supports quality layers as spatial layers without any resolution + changes; hereinafter, the term "spatial layer" is used to represent + both spatial and quality layers. + + This payload format specification defines how such temporal and + spatial scalability layers can be described and communicated. Layers are designed (and MUST be encoded) such that if any layer, and all higher layers, are removed from the bitstream along any of the - three dimensions, the remaining bitstream is still correctly - decodable. - - For terminology, this document uses the term "frame" to refer to a - single encoded VP9 image, and "picture" to refer to all the - representations of frames at a single instant in time. A picture - thus can consist of multiple frames, encoding different spatial and/ - or quality layers. + two dimensions, the remaining bitstream is still correctly decodable. - [Editor's Note: Are separate spatial and quality layers necessary and - useful? We could simplify by only defining a single sequence of - frames within a picture. 
+ For terminology, this document uses the term "layer frame" to refer + to a single encoded VP9 frame for a particular resolution/quality, + and "super frame" to refer to all the representations (layer frames) + at a single instant in time. A super frame thus consists of one or + more layer frames, encoding different spatial layers. - Two modes of describing layer information are possible: "non-flexible - mode" and "flexible mode". An encoder can freely switch between the - two as appropriate. + Within a super frame, a layer frame with spatial layer ID equal to S, + where S > 0, can depend on a frame with a lower spatial layer ID. + This "inter-layer" dependency results in additional coding gain to + the traditional "inter-picture" dependency, where a frame depends on + previously coded frame in time. For simplicity, this payload format + assumes that, within a super frame if inter-layer dependency is used, + a spatial layer S frame can only depend on spatial layer S-1 frame + when S > 0. Additionally, if inter-picture dependency is used, + spatial layer S frame is assumed to only depend on previously coded + spatial layer S frame. - In non-flexible mode, an SS message, which defines the layer - hierarchy, is sent in the beginning of the stream together with the - key frame. Each packet will have a picture id and reference indices, - which in conjunction with the SS and the RTP sequence number can be -Uberti, et al. Expires April 30, 2015 [Page 3] +Uberti, et al. Expires April 17, 2016 [Page 3] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 - used to determine if the packet is decodable or not. An SU message - can be sent by the sending client, or an MCU, to notify the receiver - about what subset of the SS it will actually be receiving. + TODO: Describe how simulcast can be supported?
- In the flexible mode each packet contains 1-4 reference indices, + Given above simplifications for inter-layer and inter-picture + dependencies, a flag (the D bit described below) is used to indicate + whether a spatial layer S frame depends on spatial layer S-1 frame. + Given the D bit, a receiver only needs to additionally know the + inter-picture dependency structure for a given spatial layer frame in + order to determine its decodability. Two modes of describing the + inter-picture dependency structure are possible: "flexible mode" and + "non-flexible mode". An encoder can only switch between the two on + the very first packet of a key frame with temporal layer ID equal to + 0. + + In flexible mode, each packet can contain up to 3 reference indices, which identifies all frames referenced by the frame transmitted in - the current packet. This enables a receiver to identify if a frame - is decodable or not and helps it understand the layer structure so - that it can drop packets as it sees fit. Since this is signaled in - each packet it makes it possible to have more flexible layer + the current packet for inter-picture prediction. This (along with + the D bit) enables a receiver to identify if a frame is decodable or + not and helps it understand the temporal layer structure so that it + can drop packets as it sees fit. Since this is signaled in each + packet it makes it possible to have very flexible temporal layer hierarchies and patterns which are changing dynamically. + In non-flexible mode, the inter-picture dependency (the reference + indices) of a group of frames (GOF) MUST be pre-specified as part of + the scalability structure (SS) data. In this mode, each packet MUST + have an index to refer to one of the described frames in the GOF, + from which the frames referenced by the frame transmitted in the + current packet for inter-picture prediction can be identified. 
+ + The SS data can also be used to specify the resolution of each + spatial layer present in the VP9 stream for both flexible and non- + flexible modes. + 4. Payload Format This section describes how the encoded VP9 bitstream is encapsulated @@ -200,30 +221,9 @@ Internet-Draft RTP Payload Format for VP9 October 2014 - - - - - - - - - - - - - - - - - - - - - -Uberti, et al. Expires April 30, 2015 [Page 4] +Uberti, et al. Expires April 17, 2016 [Page 4] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 The general RTP payload format for VP9 is depicted below. @@ -259,17 +259,17 @@ Internet-Draft RTP Payload Format for VP9 October 2014 Figure 1 - Marker bit (M): MUST be set for the final packet of each encoded - frame. This enables a decoder to finish decoding the frame, where - it otherwise may need to wait for the next packet to explicitly - know that the frame is complete. Note that, if spatial or quality - scalability is in use, more frames from the same picture may - follow; see the description of the E bit below. + Marker bit (M): MUST be set to 1 for the final packet of the highest + spatial layer frame (the final packet of the super frame), and 0 + otherwise. Unless spatial scalability is in use for this super + frame, this will have the same value as the E bit described below. + Note that a MANE MUST set this value to 1 for the target spatial + layer frame when shaping out higher spatial layers. - Timestamp: The RTP timestamp indicates the time when the frame was - sampled, at a clock rate of 90 kHz. If a picture is encoded with - multiple frames, all of the frames of the picture have the same - timestamp. + Timestamp: The RTP timestamp indicates the time when the input frame + was sampled, at a clock rate of 90 kHz. If the input frame is + encoded with multiple layer frames, all of the layer frames of the + super frame MUST have the same timestamp. 
Sequence number: The sequence numbers are monotonically increasing in order of the encoded bitstream. @@ -277,205 +277,269 @@ Internet-Draft RTP Payload Format for VP9 October 2014 -Uberti, et al. Expires April 30, 2015 [Page 5] +Uberti, et al. Expires April 17, 2016 [Page 5] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 - The remaining RTP header fields are used as specified in - [RFC3550]. + The remaining RTP header fields are used as specified in [RFC3550]. 4.2. VP9 Payload Description - The first octets after the RTP header are the VP9 payload descriptor, - with the following structure. - - 0 1 2 3 4 5 6 7 - +-+-+-+-+-+-+-+-+ - |I|L|F|B|E|V|U|-| (REQUIRED) - +-+-+-+-+-+-+-+-+ - I: |M|PICTURE ID | (RECOMMENDED) - +-+-+-+-+-+-+-+-+ - M: | EXTENDED PID | (RECOMMENDED) - +-+-+-+-+-+-+-+-+ - L: | T | S | Q | R | (CONDITIONALLY RECOMMENDED) - +-+-+-+-+-+-+-+-+ -\ - F: | PID |X| RS| RQ| (OPTIONAL) . - +-+-+-+-+-+-+-+-+ . - R times - X: | EXTENDED PID | (OPTIONAL) . - +-+-+-+-+-+-+-+-+ -/ - V: | SS | - | .. | - +-+-+-+-+-+-+-+-+ - U: | SU | - | .. | - +-+-+-+-+-+-+-+-+ + In flexible mode (with the F bit below set to 1), The first octets + after the RTP header are the VP9 payload descriptor, with the + following structure. + + 0 1 2 3 4 5 6 7 + +-+-+-+-+-+-+-+-+ + |I|P|L|F|B|E|V|-| (REQUIRED) + +-+-+-+-+-+-+-+-+ + I: |M| PICTURE ID | (REQUIRED) + +-+-+-+-+-+-+-+-+ + M: | EXTENDED PID | (RECOMMENDED) + +-+-+-+-+-+-+-+-+ + L: | T |U| S |D| (CONDITIONALLY RECOMMENDED) + +-+-+-+-+-+-+-+-+ -\ + P,F: | P_DIFF |N| (CONDITIONALLY REQUIRED) - up to 3 times + +-+-+-+-+-+-+-+-+ -/ + V: | SS | + | .. | + +-+-+-+-+-+-+-+-+ Figure 2 - I: PictureID present. When set to one, the OPTIONAL PictureID MUST + + + + + + + + + + + + + + + + + + + + + + + + +Uberti, et al. 
Expires April 17, 2016 [Page 6] + +Internet-Draft RTP Payload Format for VP9 October 2015 + + + In non-flexible mode (with the F bit below set to 0), The first + octets after the RTP header are the VP9 payload descriptor, with the + following structure. + + 0 1 2 3 4 5 6 7 + +-+-+-+-+-+-+-+-+ + |I|P|L|F|B|E|V|-| (REQUIRED) + +-+-+-+-+-+-+-+-+ + I: |M| PICTURE ID | (RECOMMENDED) + +-+-+-+-+-+-+-+-+ + M: | EXTENDED PID | (RECOMMENDED) + +-+-+-+-+-+-+-+-+ + L: | T |U| S |D| (CONDITIONALLY RECOMMENDED) + +-+-+-+-+-+-+-+-+ + | TL0PICIDX | (CONDITIONALLY REQUIRED) + +-+-+-+-+-+-+-+-+ + V: | SS | + | .. | + +-+-+-+-+-+-+-+-+ + + + Figure 3 + + I: Picture ID (PID) present. When set to one, the OPTIONAL PID MUST be present after the mandatory first octet and specified as below. - Otherwise, PictureID MUST NOT be present. + Otherwise, PID MUST NOT be present. - L: Layer indices present. When set to one, the octets following the - first octet and the extended Picture ID (if present) are as - described by "Layer indices" below. + P: Inter-picture predicted layer frame. When set to zero, the layer + frame does not utilize inter-picture prediction. In this case, + up-switching to current spatial layer's frame is possible from + directly lower spatial layer frame. P SHOULD also be set to zero + when encoding a layer synchronization frame in response to an LRR + [I-D.lennox-avtext-lrr]. When P is set to zero, the T bit + (described below) MUST also be set to 0 (if present). - F: Reference indices present. When set to one, the octets following - the first octet and the extended Picture ID (if present) are as - described by "Reference indices" below. This MUST only be set if - L is also 1; if L is 0 then this MUST be set to zero and ignored - by receivers. + L: Layer indices present. When set to one, the one or two octets + following the mandatory first octet and the PID (if present) is as + described by "Layer indices" below. 
If the F bit (described + below) is set to 1 (indicating flexible mode), then only one octet + is present for the layer indices. Otherwise if the F bit is set + to 0 (indicating non-flexible mode), then two octets are present + for the layer indices. - B: Start of VP9 frame. MUST be set to 1 if the first payload octet - of the RTP packet is the beginning of a new VP9 frame, and MUST + F: Flexible mode. F set to one indicates flexible mode and if the P + bit is also set to one, then the octets following the mandatory + first octet, the PID, and layer indices (if present) are as + described by "Reference indices" below. This MUST only be set to + 1 if the I bit is also set to one; if the I bit is set to zero, -Uberti, et al. Expires April 30, 2015 [Page 6] +Uberti, et al. Expires April 17, 2016 [Page 7] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 - NOT be 1 otherwise. Note that this frame might not be the first - frame of the picture. + then this MUST also be set to zero and ignored by receivers. The + value of this F bit CAN ONLY CHANGE on the very first packet of a + key picture. This is a packet with the P bit equal to zero, S or + D bit (described below) equal to zero, and B bit (described below) + equal to 1. - E: End of picture. MUST be set to 1 for the final RTP packet of a - VP9 picture, and 0 otherwise. Unless spatial or quality - scalability is in use for this picture, this will have the same - value as the marker bit in the RTP header. + B: Start of a layer frame. MUST be set to 1 if the first payload + octet of the RTP packet is the beginning of a new VP9 layer frame, + and MUST NOT be 1 otherwise. Note that this layer frame might not + be the very first layer frame of a super frame. - V: Scalability Structure (SS) present. When set to one, the OPTIONAL - Scalability Structure MUST be present in the payload descriptor. - Otherwise, the Scalability Structure MUST NOT be present. 
+ E: End of a layer frame. MUST be set to 1 for the final RTP packet + of a VP9 layer frame, and 0 otherwise. This enables a decoder to + finish decoding the layer frame, where it otherwise may need to + wait for the next packet to explicitly know that the layer frame + is complete. Note that, if spatial scalability is in use, more + layer frames from the same super frame may follow; see the + description of the M bit above. - U: Scalability Structure Update (SU) present. When set to one, the - OPTIONAL Scalability Structure Update MUST be present in the - payload descriptor. Otherwise, the Scalability Structure Update - MUST NOT be present. + V: Scalability structure (SS) data present. When set to one, the + OPTIONAL SS data MUST be present in the payload descriptor. + Otherwise, the SS data MUST NOT be present. -: Bit reserved for future use. MUST be set to zero and MUST be ignored by the receiver. - After the extension bit field follow the extension data fields that - are enabled. + The mandatory first octet is followed by the extension data fields + that are enabled: M: The most significant bit of the first octet is an extension flag. - The field MUST be present if the I bit is equal to one. If set - the PictureID field MUST contain 16 bits else it MUST contain 8 - bits including this MSB, see PictureID. - - PictureID: 8 or 16 bits including the M bit. This is a running - index of the frames. The field MUST be present if the I bit is - equal to one. The 7 following bits carry (parts of) the - PictureID. If the extension flag is one, the PictureID continues - in the next octet forming a 15 bit index, where the 8 bits in the - second octet are the least significant bits of the PictureID. If - the extension flag is zero, there is no extension, and the - PictureID is the 7 remaining bits of the first (and only) octet. - The sender may choose 7 or 15 bits index. The PictureID SHOULD - start on a random number, and MUST wrap after reaching the maximum - ID. 
The receiver MUST NOT assume that the number of bits in - PictureID stay the same through the session. - - Layer indices: This byte is optional, but recommended whenever - encoding with layers. T, S and Q are 2-bit indices for temporal, - spatial, and quality layers, respectively. S and Q start at zero - for each picture, and increment consecutively (with Q incrementing - before S). These can help MCUs measure bitrates per layer and can - help them make a quick decision on whether to relay a packet or - not. They can also help receivers determine what layers they are - - - -Uberti, et al. Expires April 30, 2015 [Page 7] - -Internet-Draft RTP Payload Format for VP9 October 2014 + The field MUST be present if the I bit is equal to one. If set, + the PID field MUST contain 15 bits; otherwise, it MUST contain 7 + bits. See PID below. + Picture ID (PID): Picture ID represented in 7 or 15 bits, depending + on the M bit. This is a running index of the pictures. The field + MUST be present if the I bit is equal to one. If M is set to + zero, 7 bits carry the PID; else if M is set to one, 15 bits carry + the PID. The sender may choose between 7 or 15 bits index. The + PID SHOULD start on a random number, and MUST wrap after reaching + the maximum ID. The receiver MUST NOT assume that the number of + bits in PID stay the same through the session. - currently decoding. If "F" is set in the initial octet, R is 2 - bits representing the number of reference fields this frame refers - to. R MAY be zero, indicating a keyframe. The layer indices - field will be followed by R reference indices. If "F" is not set, - R MUST be set to zero and ignored by receivers. + In the non-flexible mode (when the F bit is set to 0), this PID is + used as an index to the GOF specified in the SS data bleow. In + this mode, the PID of the key frame corresponds to the very first + specified frame in the GOF. 
Then subsequent PIDs are mapped to - Reference indices: These bytes are optional, but recommended when - encoding with layers in the flexible mode. They are also - recommended in the non-flexible mode when sending frames which are - out of sync with the pattern signaled with the SS, for instance - when encoding a layer synchronization frame in response to a LIR. - PID: The relative Picture ID referred to by this frame. I.e., - PID=3 on a packet containing the frame with Picture ID 112 - means that the frame refers back to the frame with picture ID - 109. This calculation is done modulo the size of the Picture - ID field, i.e. either 7 or 15 bits. For most layer structures - a 3-bit relative Picture ID will be enough; however, the X bit - can be used to refer to pictures with Picture IDs more than 7 - previously. - RS and RQ: The spatial and quality layer IDs of the frame - referred to by this frame, in the picture identified by the - relative Picture ID. - X: 1 if this layer index has an extended relative Picture ID. +Uberti, et al. Expires April 17, 2016 [Page 8] + +Internet-Draft RTP Payload Format for VP9 October 2015 - These 1-2 bytes are repeated R times, defined by the two R bits in - the layer indices field. -4.2.1. Scalability Structure (SS): + subsequently specified frames in the GOF (modulo N_G, specified in + the SS data below), respectively. - The Scalability Structure data describes the pattern of scalable - frames that will be used in a scalable stream. If the VP9 payload - header's "V" bit is set, the scalability structure (SS) is present in - the position indicated in Figure 2. + Layer indices: This information is optional but recommended whenever + encoding with layers. For both flexible and non-flexible modes, + one octet is used to specify a layer frame's temporal layer ID (T) + and spatial layer ID (S) as shown both in Figure 2 and Figure 3. + Additionally, a bit (U) is used to indcate that the current frame + is a "switching up point" frame. 
Another bit (D) is used to + indicate whether inter-layer prediction is used for the current + layer frame. - +-+-+-+-+-+-+-+-+ - V: | PATTERN LENGTH| - +-+-+-+-+-+-+-+-+ -\ - | T | S | Q | R | (OPTIONAL) . - +-+-+-+-+-+-+-+-+ -\ . - | PID |X| RS| RQ| (OPTIONAL) . . - PAT. LEN. times - +-+-+-+-+-+-+-+-+ . - R times . - X: | EXTENDED PID | (OPTIONAL) . . - +-+-+-+-+-+-+-+-+ -/ -/ + In the non-flexible mode (when the F bit is set to 0), another + octet is used to represent temporal layer 0 index (TL0PICIDX), as + depicted in Figure 3. The TL0PICIDX is present so that all + minimally required frames - the base temporal layer frames - can + be tracked. - Figure 3 + The T and S fields, indicate the temporal and spatial layers and + can help MCUs measure bitrates per layer and can help them make a + quick decision on whether to relay a packet or not. They can also + help receivers determine what layers they are currently decoding. + + T: The temporal layer ID of currenent frame. In the case of non- + flexible mode, if PID is mapped to a frame in a specified GOF, + then the value of T MUST match the corresponding T value of the + mapped frame in the GOF. + + U: Switching up point. If this bit is set to 1 for the current + frame with temporal layer ID equal to T, then "switch up" to a + higher frame rate is possible as subsequent higher temporal + layer frames will not depend on any frame before the current + frame (in coding time) with temporal layer ID greater than T. + S: The spatial layer ID of current frame. Note that frames with + spatial layer S > 0 may be dependent on decoded spatial layer + S-1 frame within the same super frame. + D: Inter-layer dependency used. MUST be set to one if current + spatial layer S frame depends on spatial layer S-1 frame of the + same super frame. MUST only be set to zero if current spatial + layer S frame does not depend on spatial layer S-1 frame of the + same super frame. 
For the base layer frame with S equal to 0, + this D bit MUST be set to zero. -Uberti, et al. Expires April 30, 2015 [Page 8] + TL0PICIDX: 8 bits temporal layer zero index. TL0PICIDX is only + present in the non-flexible mode (F = 0). This is a running + index for the temporal base layer frames, i.e., the frames with + + + +Uberti, et al. Expires April 17, 2016 [Page 9] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 + + T set to 0. If T is larger than 0, TL0PICIDX indicates which + temporal base layer frame the current frame depends on. + TL0PICIDX MUST be incremented when T is equal to 0. The index + SHOULD start on a random number, and MUST restart at 0 after + reaching the maximum number 255. - The scalability structure allows the structure of the VP9 stream to - be predeclared, rather than indicating it on the fly with every frame - as with the layer indices. + Reference indices: When P and F are both set to one, indicating a + non-key frame in flexible mode, then at least one reference index + has to be specified as below. Additional reference indices (total + of up to 3 reference indices are allowed) may be specified using + the N bit below. When either P or F is set to zero, then no + reference index is specified. - Its structure consists of a sequence of frames, encoded as with the - layer indices. It begins with PATTERN LENGTH, indicating the number - of frames in the pattern; it is then followed by that many instances - of data encoded using the same semantics as the layer indices. + P_DIFF: The reference index (in 7 bits) specified as the relative + PID from the current frame. For example, when P_DIFF=3 on a + packet containing the frame with PID 112 means that the frame + refers back to the frame with PID 109. This calculation is + done modulo the size of the PID field, i.e., either 7 or 15 + bits. + + N: 1 if there is additional P_DIFF following the current P_DIFF. + +4.2.1. 
Scalability Structure (SS): + + The scalability structure (SS) data describes the resolution of each + layer frame within a super frame as well as the inter-picture + dependencies for a group of frames (GOF). If the VP9 payload + descriptor's "V" bit is set, the SS data is present in the position + indicated in Figure 2 and Figure 3. - TODO: add frame resolution information. - In a scalable stream sent with a fixed pattern, the scalability - structure SHOULD be included in the first packet of every keyframe - picture, and also in the first packet of the first picture in which - the scalability structure changes. If a SS is included in a picture - with TID not equal to 0, it MUST also be repeated in the first packet - the first frame with a lower TID, until TID equals 0. - If PATTERN LENGTH is 0, it indicates that no fixed scalability - information is present going forward in the bitstream. An SS with a - PATTERN LENGTH of 0 allows a bitstream to be changed from non- - flexible to flexible mode. -4.2.2. Scalability Structure Update (SU): - TODO @@ -493,19 +557,81 @@ Internet-Draft RTP Payload Format for VP9 October 2014 +Uberti, et al. Expires April 17, 2016 [Page 10] + +Internet-Draft RTP Payload Format for VP9 October 2015 + + + +-+-+-+-+-+-+-+-+ + V: | N_S |Y|G|-|-|-| + +-+-+-+-+-+-+-+-+ -\ + Y: | WIDTH | (OPTIONAL) . + + + . + | | (OPTIONAL) . + +-+-+-+-+-+-+-+-+ . - N_S + 1 times + | HEIGHT | (OPTIONAL) . + + + . + | | (OPTIONAL) . + +-+-+-+-+-+-+-+-+ -/ -\ + G: | N_G | (OPTIONAL) + +-+-+-+-+-+-+-+-+ -\ + N_G: | T |U| R |-|-| (OPTIONAL) . + +-+-+-+-+-+-+-+-+ -\ . - N_G times + | P_DIFF | (OPTIONAL) . - R times . + +-+-+-+-+-+-+-+-+ -/ -/ + + Figure 4 + + N_S: N_S + 1 indicates the number of spatial layers present in the + VP9 stream. + + Y: Each spatial layer's frame resolution present. When set to one, + the OPTIONAL WIDTH (2 octets) and HEIGHT (2 octets) MUST be + present for each layer frame. Otherwise, the resolution MUST NOT + be present. 
+ + G: GOF description present flag. + + -: Bit reserved for future use. MUST be set to zero and MUST be + ignored by the receiver. + N_G: N_G indicates the number of frames in a GOF. If N_G is greater + than 0, then the SS data allows the inter-picture dependency + structure of the VP9 stream to be pre-declared, rather than + indicating it on the fly with every packet. If N_G is greater + than 0, then for N_G pictures in the GOF, each frame's temporal + layer ID (T), switch up point (U), and the R reference indices + (P_DIFFs) are specified. + The very first frame specified in the GOF MUST have T set to 0. + G set to 0 or N_G set to 0 indicates that either there is only one + temporal layer or no fixed inter-picture dependency information is + present going forward in the bitstream. -Uberti, et al. Expires April 30, 2015 [Page 9] +Uberti, et al. Expires April 17, 2016 [Page 11] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 + Note that for a given super frame, all layer frames follow the + same inter-picture dependency structure. However, the frame rate + of each spatial layer can be different from each other and this + can be controlled with the use of the D bit described above. The + specified dependency structure in the SS data MUST be for the + highest frame rate layer. + + In a scalable stream sent with a fixed pattern, the SS data SHOULD be + included in the first packet of every key frame. This is a packet + with P bit equal to zero, S or D bit equal to zero, and B bit equal + to 1. The SS data MUST only be changed on the frame that corresponds + to the very first frame specified in the previous SS data's GOF (if + the previous SS data's N_G was greater than 0). + 4.3. VP9 Payload Header TODO: need to describe VP9 payload header. 
@@ -540,6 +666,14 @@ Internet-Draft RTP Payload Format for VP9 October 2014 decoder knows is perfect -- or, it can be used as positive feedback information to acknowledge correct decoding of certain reference pictures. The positive feedback method is useful for VP9 used as + + + +Uberti, et al. Expires April 17, 2016 [Page 12] + +Internet-Draft RTP Payload Format for VP9 October 2015 + + unicast. The use of RPSI for VP9 is preferably combined with a special update pattern of the codec's two special reference frames -- the golden frame and the altref frame -- in which they are updated in @@ -550,18 +684,6 @@ Internet-Draft RTP Payload Format for VP9 October 2014 message body (i.e., the "native RPSI bit string" in [RFC4585]) is simply the PictureID of the received frame. - - - - - - - -Uberti, et al. Expires April 30, 2015 [Page 10] - -Internet-Draft RTP Payload Format for VP9 October 2014 - - 5.2. SLI TODO: Update to indicate which frame within the picture. @@ -578,7 +700,7 @@ Internet-Draft RTP Payload Format for VP9 October 2014 | First | Number | PictureID | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - Figure 4 + Figure 5 Here, First is the macroblock address (in scan order) of the first lost block and Number is the number of lost blocks. PictureID is the @@ -596,6 +718,18 @@ Internet-Draft RTP Payload Format for VP9 October 2014 can encode the next frame with reference to that established reference. + + + + + + + +Uberti, et al. Expires April 17, 2016 [Page 13] + +Internet-Draft RTP Payload Format for VP9 October 2015 + + 5.3. Example TODO: this example is copied from the VP8 payload format @@ -610,14 +744,6 @@ Internet-Draft RTP Payload Format for VP9 October 2014 acknowledged, the same rule applies when updating the altref frame. +-------+-------------------+-------------------------+-------------+ - - - -Uberti, et al. 
Expires April 30, 2015 [Page 11] - -Internet-Draft RTP Payload Format for VP9 October 2014 - - | Event | Sender | Receiver | Established | | | | | reference | +-------+-------------------+-------------------------+-------------+ @@ -652,6 +778,14 @@ Internet-Draft RTP Payload Format for VP9 October 2014 | | | frame (decoder state | | | | | restored) | | | | | | | + + + +Uberti, et al. Expires April 17, 2016 [Page 14] + +Internet-Draft RTP Payload Format for VP9 October 2015 + + | ... | (sending regular | | | | | frames) | | | | | | | | @@ -666,14 +800,6 @@ Internet-Draft RTP Payload Format for VP9 October 2014 | 1202 | Receive RPSI(200) | | altref | | | | | | | ... | (sending regular | | | - - - -Uberti, et al. Expires April 30, 2015 [Page 12] - -Internet-Draft RTP Payload Format for VP9 October 2014 - - | | frames) | | | | | | | | | 1300 | Send golden frame | | | @@ -707,54 +833,20 @@ Internet-Draft RTP Payload Format for VP9 October 2014 restrictions as specified in [RFC4585], as long as the picture is corrupted. -6. Layer Intra Request - - Editor's Note: The message described in this section is applicable to - other codecs beyond just VP9. In the future it will be likely be - split out into another document. - - TODO: details of how this is encoded in RTCP. - - A synchronization frame can be requested by sending a LIR, which is - an RTCP feedback message asking the encoder to encode a frame which - makes it possible to upgrade to a higher layer. The LIR message - contains two tuples, {T1,S1,Q1} and {T2,S2,Q2}, where the first tuple - is the currently highest layer the decoder can decode, while the - second tuple is the layer the decoder wants to upgrade to. -Uberti, et al. Expires April 30, 2015 [Page 13] +Uberti, et al. 
Expires April 17, 2016 [Page 15] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 - Identification of an upgrade frame can be derived from the reference - IDs of each frame by backtracking the dependency chain until reaching - a point where only decodable frames are being referenced. Therefore - it's recommended both for both the flexible and the non-flexible mode - that, when upgrade frames are being encoded in response to a LIR, - those packets should contain layer indices and the reference fields - so that the decoder or an MCU can make this derivation. - - Example: - - LIR {1,1,0}, {1,2,1} is sent by an MCU when it is currently relaying - {1,1,0} to a receiver and which wants to upgrade to {1,2,1}. In - response the encoder should encode the next frames in layers {1,1,1} - and {1,2,1} by only referring to frames in {1,1,0}, {1,0,0} or - {0,0,0}. - - In the non-flexible mode, periodic upgrade frames can be defined by - the layer structure of the SS, thus periodic upgrade frames can be - automatically identified by the picture ID. - -7. Payload Format Parameters +6. Payload Format Parameters This payload format has two required parameters. -7.1. Media Type Definition +6.1. Media Type Definition This registration is done using the template defined in [RFC6838] and following [RFC4855]. @@ -778,25 +870,15 @@ Internet-Draft RTP Payload Format for VP9 October 2014 The decoder is capable of decoding this frame size as long as the width and height of the frame in macroblocks are less than - - - -Uberti, et al. Expires April 30, 2015 [Page 14] - -Internet-Draft RTP Payload Format for VP9 October 2014 - - int(sqrt(max-fs * 8)) - for instance, a max-fs of 1200 (capable of supporting 640x480 resolution) will support widths and heights up to 1552 pixels (97 macroblocks). - Optional parameters: none - Encoding considerations: This media type is framed in RTP and contains binary data; see Section 4.8 of [RFC6838]. 
- Security considerations: See Section 8 of RFC xxxx. + Security considerations: See Section 7 of RFC xxxx. [RFC Editor: Upon publication as an RFC, please replace "XXXX" with the number assigned to this document and remove this note.] @@ -808,8 +890,18 @@ Internet-Draft RTP Payload Format for VP9 October 2014 with the number assigned to this document and remove this note.] Applications which use this media type: + + + +Uberti, et al. Expires April 17, 2016 [Page 16] + +Internet-Draft RTP Payload Format for VP9 October 2015 + + For example: Video over IP, video conferencing. + Fragment identifier considerations: N/A. + Additional information: None. Person & email address to contact for further information: @@ -826,23 +918,11 @@ Internet-Draft RTP Payload Format for VP9 October 2014 Change controller: IETF Payload Working Group delegated from the IESG. -7.2. SDP Parameters +6.2. SDP Parameters The receiver MUST ignore any fmtp parameter unspecified in this memo. - - - - - - - -Uberti, et al. Expires April 30, 2015 [Page 15] - -Internet-Draft RTP Payload Format for VP9 October 2014 - - -7.2.1. Mapping of Media Subtype Parameters to SDP +6.2.1. Mapping of Media Subtype Parameters to SDP The media type video/VP9 string is mapped to fields in the Session Description Protocol (SDP) [RFC4566] as follows: @@ -855,23 +935,32 @@ Internet-Draft RTP Payload Format for VP9 October 2014 o The clock rate in the "a=rtpmap" line MUST be 90000. o The parameters "max-fs", and "max-fr", MUST be included in the - "a=fmtp" line of SDP. These parameters are expressed as a media - subtype string, in the form of a semicolon separated list of + "a=fmtp" line of SDP if SDP is used to declare receiver + capabilities. These parameters are expressed as a media subtype + string, in the form of a semicolon separated list of parameter=value pairs. -7.2.1.1. Example +6.2.1.1. 
Example An example of media representation in SDP is as follows: m=video 49170 RTP/AVPF 98 a=rtpmap:98 VP9/90000 + + + +Uberti, et al. Expires April 17, 2016 [Page 17] + +Internet-Draft RTP Payload Format for VP9 October 2015 + + a=fmtp:98 max-fr=30; max-fs=3600; -7.2.2. Offer/Answer Considerations +6.2.2. Offer/Answer Considerations TODO: Update this for VP9 -8. Security Considerations +7. Security Considerations RTP packets using the payload format defined in this specification are subject to the security considerations discussed in the RTP @@ -890,21 +979,13 @@ Internet-Draft RTP Payload Format for VP9 October 2014 may vary. It is dependent on the application, the transport, and the signaling protocol employed. Therefore a single mechanism is not sufficient, although if suitable the usage of SRTP [RFC3711] is - - - -Uberti, et al. Expires April 30, 2015 [Page 16] - -Internet-Draft RTP Payload Format for VP9 October 2014 - - recommended. This RTP payload format and its media decoder do not exhibit any significant non-uniformity in the receiver-side computational complexity for packet processing, and thus are unlikely to pose a denial-of-service threat due to the receipt of pathological data. Nor does the RTP payload format contain any active content. -9. Congestion Control +8. Congestion Control Congestion control for RTP SHALL be used in accordance with RFC 3550 [RFC3550], and with any applicable RTP profile; e.g., RFC 3551 @@ -917,54 +998,85 @@ Internet-Draft RTP Payload Format for VP9 October 2014 reference frames cannot be done if the stream is encrypted (because the non-reference marker is encrypted). -10. IANA Considerations + + + + + + + +Uberti, et al. Expires April 17, 2016 [Page 18] + +Internet-Draft RTP Payload Format for VP9 October 2015 + + +9. IANA Considerations The IANA is requested to register the following values: - - Media type registration as described in Section 7.1. + - Media type registration as described in Section 6.1. -11. 
References +10. References + +10.1. Normative References [I-D.grange-vp9-bitstream] Grange, A. and H. Alvestrand, "A VP9 Bitstream Overview", draft-grange-vp9-bitstream-00 (work in progress), February 2013. + [I-D.lennox-avtext-lrr] + Lennox, J., Hong, D., Uberti, J., Holmer, S., and M. + Flodman, "The Layer Refresh Request (LRR) RTCP Feedback + Message", draft-lennox-avtext-lrr-00 (work in progress), + March 2015. + [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate - Requirement Levels", BCP 14, RFC 2119, March 1997. + Requirement Levels", BCP 14, RFC 2119, DOI 10.17487/ + RFC2119, March 1997, + . [RFC3550] Schulzrinne, H., Casner, S., Frederick, R., and V. Jacobson, "RTP: A Transport Protocol for Real-Time - Applications", STD 64, RFC 3550, July 2003. + Applications", STD 64, RFC 3550, DOI 10.17487/RFC3550, + July 2003, . - [RFC3551] Schulzrinne, H. and S. Casner, "RTP Profile for Audio and - Video Conferences with Minimal Control", STD 65, RFC 3551, - July 2003. + [RFC4566] Handley, M., Jacobson, V., and C. Perkins, "SDP: Session + Description Protocol", RFC 4566, DOI 10.17487/RFC4566, + July 2006, . - [RFC3711] Baugher, M., McGrew, D., Naslund, M., Carrara, E., and K. - Norrman, "The Secure Real-time Transport Protocol (SRTP)", - RFC 3711, March 2004. + [RFC4585] Ott, J., Wenger, S., Sato, N., Burmeister, C., and J. Rey, + "Extended RTP Profile for Real-time Transport Control + Protocol (RTCP)-Based Feedback (RTP/AVPF)", RFC 4585, DOI + 10.17487/RFC4585, July 2006, + . - [RFC4566] Handley, M., Jacobson, V., and C. Perkins, "SDP: Session - Description Protocol", RFC 4566, July 2006. + [RFC4855] Casner, S., "Media Type Registration of RTP Payload + Formats", RFC 4855, DOI 10.17487/RFC4855, February 2007, + . + + [RFC6838] Freed, N., Klensin, J., and T. Hansen, "Media Type + Specifications and Registration Procedures", BCP 13, RFC + 6838, DOI 10.17487/RFC6838, January 2013, + . -Uberti, et al. Expires April 30, 2015 [Page 17] +Uberti, et al. 
Expires April 17, 2016 [Page 19] -Internet-Draft RTP Payload Format for VP9 October 2014 +Internet-Draft RTP Payload Format for VP9 October 2015 - [RFC4585] Ott, J., Wenger, S., Sato, N., Burmeister, C., and J. Rey, - "Extended RTP Profile for Real-time Transport Control - Protocol (RTCP)-Based Feedback (RTP/AVPF)", RFC 4585, July - 2006. +10.2. Informative References - [RFC4855] Casner, S., "Media Type Registration of RTP Payload - Formats", RFC 4855, February 2007. + [RFC3551] Schulzrinne, H. and S. Casner, "RTP Profile for Audio and + Video Conferences with Minimal Control", STD 65, RFC 3551, + DOI 10.17487/RFC3551, July 2003, + . - [RFC6838] Freed, N., Klensin, J., and T. Hansen, "Media Type - Specifications and Registration Procedures", BCP 13, RFC - 6838, January 2013. + [RFC3711] Baugher, M., McGrew, D., Naslund, M., Carrara, E., and K. + Norrman, "The Secure Real-time Transport Protocol (SRTP)", + RFC 3711, DOI 10.17487/RFC3711, March 2004, + . Authors' Addresses @@ -979,10 +1091,35 @@ Authors' Addresses Stefan Holmer Google, Inc. + Kungsbron 2 + Stockholm 111 22 + Sweden + + Email: holmer@google.com Magnus Flodman Google, Inc. + Kungsbron 2 + Stockholm 111 22 + Sweden + + Email: mflodman@google.com + + + + + + + + + + + + +Uberti, et al. Expires April 17, 2016 [Page 20] + +Internet-Draft RTP Payload Format for VP9 October 2015 Jonathan Lennox @@ -993,6 +1130,37 @@ Authors' Addresses US Email: jonathan@vidyo.com + + + Danny Hong + Vidyo, Inc. + 433 Hackensack Avenue + Seventh Floor + Hackensack, NJ 07601 + US + + Email: danny@vidyo.com + + + + + + + + + + + + + + + + + + + + + @@ -1005,4 +1173,4 @@ Authors' Addresses -Uberti, et al. Expires April 30, 2015 [Page 18] +Uberti, et al. 
Expires April 17, 2016 [Page 21] diff --git a/vp9/draft-uberti-payload-vp9-00.xml b/vp9/draft-ietf-payload-vp9-00.xml similarity index 54% rename from vp9/draft-uberti-payload-vp9-00.xml rename to vp9/draft-ietf-payload-vp9-00.xml index 206e4e3..f8eab1b 100644 --- a/vp9/draft-uberti-payload-vp9-00.xml +++ b/vp9/draft-ietf-payload-vp9-00.xml @@ -10,9 +10,10 @@ + ]> - + @@ -58,7 +59,7 @@ Google, Inc.
- Kungsbron 2 + Kungsbron 2 111 22 @@ -66,6 +67,7 @@ Sweden + holmer@google.com
@@ -73,7 +75,7 @@ Google, Inc.
- Kungsbron 2 + Kungsbron 2 111 22 @@ -81,6 +83,7 @@ Sweden + mflodman@google.com
@@ -106,6 +109,28 @@ + + Vidyo, Inc. + +
+ + 433 Hackensack Avenue + + Seventh Floor + + Hackensack + + NJ + + 07601 + + US + + + danny@vidyo.com +
+
+ RAI @@ -137,7 +162,7 @@ target="I-D.grange-vp9-bitstream"/>. The format described in this document can be used both in peer-to-peer and video conferencing applications. - TODO: VP9 description. Please see TODO: VP9 description. Please see .
@@ -153,66 +178,84 @@ The VP9 codec can maintain up to eight reference frames, of which up to three can be referenced or updated by any new frame. - VP9 also allows a reference frame to be resampled and used as a - reference for another frame of a different resolution. This - allows internal resolution changes without requiring the use of - keyframes. - - These features together enable an encoder to - implement various forms of coarse-grained scalability, - including temporal, spatial, and quality scalability modes, as - well as combinations of these, without the need for explicit - spatially scalabile encoding modes. - - This payload format specification defines how such - scalability modes can be encoded and communicated. In this - payload, three separate types of layers are defined: temporal, - spatial, and quality. - - Temporal layers define different frame rates of video; - spatial and quality layers define different, dependent - representations of a single picture. Spatial layers allow - a picture to be encoded at different resolutions, whereas - quality layers allow a picture to be encoded at the same - resolution but at different bitrates (and thus with different - amounts of coding error). - - Layers are designed (and MUST be encoded) such that if - any layer, and all higher layers, are removed from the bitstream - along any of the three dimensions, the remaining bitstream is - still correctly decodable. - - For terminology, this document uses the term "frame" to refer - to a single encoded VP9 image, and "picture" to refer to all the - representations of frames at a single instant in time. A - picture thus can consist of multiple frames, encoding different - spatial and/or quality layers. - - [Editor's Note: Are separate spatial and quality layers - necessary and useful? We could simplify by only defining a - single sequence of frames within a picture. - - Two modes of describing layer information are possible: - "non-flexible mode" and "flexible mode". 
An encoder can - freely switch between the two as appropriate. - - In non-flexible mode, an SS message, which defines the - layer hierarchy, is sent in the beginning of the stream - together with the key frame. Each packet will have a picture - id and reference indices, which in conjunction with the SS and the - RTP sequence number can be used to determine if the packet is - decodable or not. An SU message can be sent by the sending - client, or an MCU, to notify the receiver about what subset of - the SS it will actually be receiving. - - In the flexible mode each packet contains 1-4 reference - indices, which identifies all frames referenced by the frame - transmitted in the current packet. This enables a receiver to - identify if a frame is decodable or not and helps it - understand the layer structure so that it can drop packets as - it sees fit. Since this is signaled in each packet it makes it - possible to have more flexible layer hierarchies and patterns - which are changing dynamically. + VP9 also allows a reference frame to be resampled and used as a + reference for another frame of a different resolution. This + allows internal resolution changes without requiring the use of + key frames. + + These features together enable an encoder to + implement various forms of coarse-grained scalability, + including temporal, spatial and quality scalability modes, as + well as combinations of these, without the need for explicit + scalable coding tools. + + Temporal layers define different frame rates of video; + spatial and quality layers define different and possibly dependent + representations of a single input frame. Spatial layers allow + a frame to be encoded at different resolutions, whereas + quality layers allow a frame to be encoded at the same + resolution but at different qualities (and thus with different + amounts of coding error). 
VP9 supports quality layers as + spatial layers without any resolution changes; hereinafter, + the term "spatial layer" is used to represent both spatial and + quality layers. + + This payload format specification defines how such + temporal and spatial scalability layers can be described and + communicated. + + Layers are designed (and MUST be encoded) such that if + any layer, and all higher layers, are removed from the bitstream + along any of the two dimensions, the remaining bitstream is + still correctly decodable. + + For terminology, this document uses the term "layer frame" to refer + to a single encoded VP9 frame for a particular resolution/quality, and + "super frame" to refer to all the representations (layer frames) at a single + instant in time. A super frame thus consists of one or more layer frames, + encoding different spatial layers. + + Within a super frame, a layer frame with spatial layer ID equal to S, + where S > 0, can depend on a frame with a lower spatial layer ID. This + "inter-layer" dependency results in additional coding gain to the + traditional "inter-picture" dependency, where a frame depends on previously + coded frame in time. For simplicity, this payload format assumes that, + within a super frame if inter-layer dependency is used, a spatial layer S frame + can only depend on spatial layer S-1 frame when S > 0. Additionally, if + inter-picture dependency is used, spatial layer S frame is assumed to only + depend on previously coded spatial layer S frame. + + TODO: Describe how simulcast can be supported? + + Given the above simplifications for inter-layer and inter-picture + dependencies, a flag (the D bit described below) is used to indicate whether a + spatial layer S frame depends on spatial layer S-1 frame. Given the D bit, a receiver + only needs to additionally know the inter-picture dependency structure for a given + spatial layer frame in order to determine its decodability.
Two modes + of describing the inter-picture dependency structure are possible: + "flexible mode" and "non-flexible mode". An encoder can only switch + between the two on the very first packet of a key frame with temporal + layer ID equal to 0. + + In flexible mode, each packet can contain up to 3 reference + indices, which identify all frames referenced by the frame + transmitted in the current packet for inter-picture prediction. + This (along with the D bit) enables a receiver to identify if a frame + is decodable or not and helps it understand the temporal layer structure + so that it can drop packets as it sees fit. Since this is signaled in + each packet it makes it possible to have very flexible temporal layer + hierarchies and patterns which are changing dynamically. + + In non-flexible mode, the inter-picture dependency (the reference + indices) of a group of frames (GOF) MUST be pre-specified as part of the + scalability structure (SS) data. In this mode, each + packet MUST have an index to refer to one of the described frames + in the GOF, from which the frames referenced by the frame transmitted in the current packet + for inter-picture prediction can be identified. + + The SS data can also be used to specify the resolution of each + spatial layer present in the VP9 stream for both flexible and non-flexible modes. +
@@ -260,210 +303,278 @@ - MUST be set for the final packet - of each encoded frame. This enables a decoder to finish decoding the - frame, where it otherwise may need to wait for the next packet - to explicitly know that the frame is complete. Note that, - if spatial or quality scalability is in use, more frames from the - same picture may follow; see the description of the E bit below. + MUST be set to 1 for the final packet + of the highest spatial layer frame (the final packet of the super frame), + and 0 otherwise. Unless spatial scalability is in use for this super frame, + this will have the same value as the E bit described below. Note that a + MANE MUST set this value to 1 for the target spatial layer frame + when shaping out higher spatial layers. The RTP timestamp indicates the time when - the frame was sampled, at a clock rate of 90 kHz. If a - picture is encoded with multiple frames, all of the - frames of the picture have the same timestamp. + the input frame was sampled, at a clock rate of 90 kHz. If the + input frame is encoded with multiple layer frames, all of the + layer frames of the super frame MUST have the same timestamp. The sequence numbers are monotonically increasing in order of the encoded bitstream. - - The remaining RTP header fields are used as specified in . + The remaining RTP header fields are used as specified in .
- The first octets after the RTP header are the VP9 payload - descriptor, with the following structure. + In flexible mode (with the F bit below set to 1), the first octets + after the RTP header are the VP9 payload descriptor, with the following + structure. +
+ +
+ In non-flexible mode (with the F bit below set to 0), the first octets + after the RTP header are the VP9 payload descriptor, with the following + structure. + +
- PictureID present. When set to one, the OPTIONAL - PictureID MUST be present after the mandatory first octet and - specified as below. Otherwise, PictureID MUST NOT be present. - Layer indices present. When set to one, - the octets following the first octet and the extended - Picture ID (if present) are as described by "Layer - indices" below. - Reference indices present. When set to one, - the octets following the first octet and the extended - Picture ID (if present) are as described by "Reference - indices" below. This MUST only be set if L is also - 1; if L is 0 then this MUST be set to zero and - ignored by receivers. - Start of VP9 frame. MUST be set to 1 if - the first payload octet of the RTP packet is the beginning of a - new VP9 frame, and MUST NOT be 1 otherwise. Note that this - frame might not be the first frame of the picture. - - End of picture. MUST be set to 1 for the final - RTP packet of a VP9 picture, and 0 otherwise. Unless - spatial or quality scalability is in use for this picture, this will have the same - value as the marker bit in the RTP header. - - Scalability Structure (SS) present. When set - to one, the OPTIONAL Scalability Structure MUST be - present in the payload descriptor. Otherwise, the - Scalability Structure MUST NOT be present. - - Scalability Structure Update (SU) present. When set - to one, the OPTIONAL Scalability Structure Update MUST be - present in the payload descriptor. Otherwise, the - Scalability Structure Update MUST NOT be present. - - Bit reserved for future use. MUST be set to - zero and MUST be ignored by the receiver. + Picture ID (PID) present. When set to one, the + OPTIONAL PID MUST be present after the mandatory first octet and + specified as below. Otherwise, PID MUST NOT be present. + + Inter-picture predicted layer frame. When set to zero, the + layer frame does not utilize inter-picture prediction. 
In this case, + up-switching to current spatial layer's frame is possible from directly + lower spatial layer frame. P SHOULD also be set to zero when + encoding a layer synchronization frame in response to an LRR. + When P is set to zero, the T bit (described below) MUST also be set to 0 (if present). + + Layer indices present. When set to one, + the one or two octets following the mandatory first octet and the PID + (if present) is as described by "Layer indices" below. If the F bit (described below) + is set to 1 (indicating flexible mode), then only one octet is present for the + layer indices. Otherwise if the F bit is set to 0 (indicating non-flexible mode), + then two octets are present for the layer indices. + + Flexible mode. F set to one indicates + flexible mode and if the P bit is also set to one, then the octets following + the mandatory first octet, the PID, and layer indices (if present) are + as described by "Reference indices" below. This MUST only be set to 1 if the I + bit is also set to one; if the I bit is set to zero, then this MUST also be + set to zero and ignored by receivers. The value of this F bit CAN ONLY CHANGE + on the very first packet of a key picture. This is a packet with the P bit + equal to zero, S or D bit (described below) equal to zero, and B bit (described below) + equal to 1. + + Start of a layer frame. MUST be set to 1 if + the first payload octet of the RTP packet is the beginning of a + new VP9 layer frame, and MUST NOT be 1 otherwise. Note that this + layer frame might not be the very first layer frame of a super frame. + + End of a layer frame. MUST be set to 1 for the final + RTP packet of a VP9 layer frame, and 0 otherwise. This enables a + decoder to finish decoding the layer frame, where it otherwise may need to + wait for the next packet to explicitly know that the layer frame is complete. 
+ Note that, if spatial scalability is in use, more layer frames from the + same super frame may follow; see the description of the M bit above. + + Scalability structure (SS) data present. When set + to one, the OPTIONAL SS data MUST be present in the payload descriptor. + Otherwise, the SS data MUST NOT be present. + + Bit reserved for future use. MUST be set to + zero and MUST be ignored by the receiver. + + + The mandatory first octet is followed by the extension data fields that + are enabled: + The most significant bit of the first octet is an + extension flag. The field MUST be present if the I bit is equal to + one. If set, the PID field MUST contain 15 bits; otherwise, it MUST + contain 7 bits. See PID below. + + Picture ID represented in 7 or 15 bits, + depending on the M bit. This is a running index of the pictures. The + field MUST be present if the I bit is equal to one. If M is set to zero, + 7 bits carry the PID; else if M is set to one, 15 bits carry the PID. + The sender may choose between 7 or 15 bits index. The PID SHOULD start on a + random number, and MUST wrap after reaching the maximum ID. The receiver + MUST NOT assume that the number of bits in PID stay the same through the + session. + + In the non-flexible mode (when the F bit is set to 0), this PID is used + as an index to the GOF specified in the SS data below. In this mode, the + PID of the key frame corresponds to the very first specified frame in the + GOF. Then subsequent PIDs are mapped to subsequently specified frames in + the GOF (modulo N_G, specified in the SS data below), respectively. + + This information is optional but recommended + whenever encoding with layers. For both flexible and non-flexible modes, + one octet is used to specify a layer frame's temporal layer ID (T) and spatial layer ID (S) + as shown both in and . + Additionally, a bit (U) is used to indicate that the current frame is a + "switching up point" frame. 
Another bit (D) is used to indicate whether inter-layer + prediction is used for the current layer frame. + + In the non-flexible mode (when the F bit is set to 0), another octet is used + to represent temporal layer 0 index (TL0PICIDX), as depicted in . + The TL0PICIDX is present so that all minimally required frames - the base temporal layer frames - can be tracked. + + The T and S fields indicate the temporal and spatial layers and can help MCUs measure bitrates + per layer and can help them make a quick decision on whether to relay a packet + or not. They can also help receivers determine what layers they are currently + decoding. + + + The temporal layer ID of current frame. In the case of non-flexible mode, + if PID is mapped to a frame in a specified GOF, then + the value of T MUST match the corresponding T value of the mapped frame in the GOF. + + Switching up point. If this bit is set to 1 for the current frame with temporal + layer ID equal to T, then "switch up" to a higher frame rate is possible as subsequent higher temporal + layer frames will not depend on any frame before the current frame (in coding time) with temporal layer + ID greater than T. + + The spatial layer ID of current frame. Note that frames with spatial layer S > 0 + may be dependent on decoded spatial layer S-1 frame within the same super frame. + + Inter-layer dependency used. MUST be set to one if current spatial layer S frame + depends on spatial layer S-1 frame of the same super frame. MUST only be set to zero if current spatial + layer S frame does not depend on spatial layer S-1 frame of the same super frame. For the base layer frame + with S equal to 0, this D bit MUST be set to zero. + + 8 bits temporal layer zero index. TL0PICIDX is only present + in the non-flexible mode (F = 0). This is a running index for the temporal + base layer frames, i.e., the frames with T set to 0. 
If T is larger than 0, + TL0PICIDX indicates which temporal base layer frame the current frame depends on. TL0PICIDX MUST be + incremented when T is equal to 0. The index SHOULD start on a random number, and MUST restart + at 0 after reaching the maximum number 255. - After the extension bit field follow the extension data fields that - are enabled. - The most significant bit of the first octet is an - extension flag. The field MUST be present if the I bit is equal to - one. If set the PictureID field MUST contain 16 bits else it MUST - contain 8 bits including this MSB, see PictureID. - - 8 or 16 bits - including the M bit. This is a running index of the frames. The - field MUST be present if the I bit is equal to one. The 7 - following bits carry (parts of) the PictureID. If the extension - flag is one, the PictureID continues in the next octet forming a - 15 bit index, where the 8 bits in the second octet are the least - significant bits of the PictureID. If the extension flag is zero, - there is no extension, and the PictureID is the 7 remaining bits - of the first (and only) octet. The sender may choose 7 or 15 bits - index. The PictureID SHOULD start on a random number, and MUST - wrap after reaching the maximum ID. The receiver MUST NOT assume - that the number of bits in PictureID stay the same through the - session. - - This byte is optional, but - recommended whenever encoding with layers. T, S and Q - are 2-bit indices for temporal, spatial, and quality - layers, respectively. S and Q start at zero for each - picture, and increment consecutively (with Q - incrementing before S). These can help MCUs measure bitrates per - layer and can help them make a quick decision on whether - to relay a packet or not. They can also help receivers - determine what layers they are currently decoding. If "F" is set in the initial octet, R - is 2 bits representing the number of reference fields this frame - refers to. R MAY be zero, indicating a keyframe. 
The layer indices field will be followed by R - reference indices. If "F" is not set, R MUST be set to zero and ignored by receivers. - - These bytes are optional, but recommended when encoding with layers in - the flexible mode. They are also recommended in the - non-flexible mode when sending frames which are out of sync - with the pattern signaled with the SS, for instance when - encoding a layer synchronization frame in response to a LIR. - - - The relative Picture ID referred to by this frame. I.e., PID=3 - on a packet containing the frame with Picture ID 112 means - that the frame refers back to the frame with picture ID - 109. This calculation is done modulo the size of the Picture - ID field, i.e. either 7 or 15 bits. For most layer - structures a 3-bit relative Picture ID will be enough; - however, the X bit can be used to refer to pictures with - Picture IDs more than 7 previously. - - The spatial and quality layer - IDs of the frame referred to by this frame, in the picture - identified by the relative Picture ID. - - 1 if this layer index has an extended relative Picture ID. - - These 1-2 bytes are repeated R times, defined by the two R bits in the - layer indices field. - - -
- The Scalability Structure data describes - the pattern of scalable frames that will be used in a scalable - stream. If the VP9 payload header's "V" bit is set, - the scalability structure (SS) is present in the position - indicated in . -
- When P and F are both set to one, indicating a non-key frame in + flexible mode, then at least + one reference index has to be specified as below. Additional reference indices (total of up to + 3 reference indices are allowed) may be specified using the N bit below. When either P or F is + set to zero, then no reference index is specified. + + The reference index (in 7 bits) specified as the + relative PID from the current frame. For example, P_DIFF=3 + on a packet containing the frame with PID 112 means + that the frame refers back to the frame with PID + 109. This calculation is done modulo the size of the PID field, + i.e., either 7 or 15 bits. + 1 if there is an additional P_DIFF following the current P_DIFF. + + + +
+ The scalability structure (SS) data describes the resolution of + each layer frame within a super frame as well as the inter-picture dependencies + for a group of frames (GOF). If the VP9 payload descriptor's "V" + bit is set, the SS data is present in the position indicated in + and . +
+
- The scalability structure allows the structure of the - VP9 stream to be predeclared, rather than indicating it on - the fly with every frame as with the layer indices. - - Its structure consists of a sequence of frames, encoded - as with the layer indices. It begins with PATTERN LENGTH, - indicating the number of frames in the pattern; it is then - followed by that many instances of data encoded using the - same semantics as the layer indices. - - TODO: add frame resolution information. - - In a scalable - stream sent with a fixed pattern, the scalability - structure SHOULD be included in the first packet of every - keyframe picture, and also in the first packet of the - first picture in which the scalability structure changes. - If a SS is included in a picture with TID not equal to 0, - it MUST also be repeated in the first packet the first - frame with a lower TID, until TID equals 0. - - If PATTERN LENGTH is 0, it indicates that no fixed - scalability information is present going forward in the - bitstream. An SS with a PATTERN LENGTH of 0 allows a - bitstream to be changed from non-flexible to flexible - mode. - -
- -
- TODO - - - -
+ + N_S + 1 indicates the number of spatial + layers present in the VP9 stream. + + Each spatial layer's frame resolution present. + When set to one, the OPTIONAL WIDTH (2 octets) and HEIGHT + (2 octets) MUST be present for each layer frame. Otherwise, the + resolution MUST NOT be present. + + GOF description present flag. + + Bit reserved for future use. MUST be set to + zero and MUST be ignored by the receiver. + + N_G indicates the number of frames in a GOF. + If N_G is greater than 0, then the SS data allows + the inter-picture dependency structure of the VP9 stream to + be pre-declared, rather than indicating it on the fly with + every packet. If N_G is greater than 0, then for N_G + pictures in the GOF, each frame's temporal layer ID (T), switch up point (U), + and the R reference indices (P_DIFFs) are specified. + + The very first frame specified in the GOF MUST have T set to 0. + + G set to 0 or N_G set to 0 indicates that either there is only one temporal + layer or no fixed inter-picture dependency information is present + going forward in the bitstream. + + Note that for a given super frame, all layer frames follow the + same inter-picture dependency structure. However, the frame rate + of each spatial layer can be different from each other and this can + be controlled with the use of the D bit described above. The + specified dependency structure in the SS data MUST be for the highest + frame rate layer. + + + In a scalable stream sent with a fixed pattern, the SS data + SHOULD be included in the first packet of every key frame. This is a packet + with P bit equal to zero, S or D bit equal to zero, and B bit equal to 1. + The SS data MUST only be changed on the frame that corresponds to the very + first frame specified in the previous SS data's GOF + (if the previous SS data's N_G was greater than 0).
+
TODO: need to describe VP9 payload header. @@ -777,42 +888,6 @@ X: | EXTENDED PID | (OPTIONAL) . .
-
- Editor's Note: The message described in this section is - applicable to other codecs beyond just VP9. In the future it - will be likely be split out into another document. - - TODO: details of how this is encoded in RTCP. - - A synchronization frame can be requested by sending a LIR, - which is an RTCP feedback message asking the encoder to encode a frame - which makes it possible to upgrade to a higher layer. The LIR - message contains two tuples, {T1,S1,Q1} and {T2,S2,Q2}, where - the first tuple is the currently highest layer the decoder can - decode, while the second tuple is the layer the decoder wants to - upgrade to. - - Identification of an upgrade frame can be derived from the - reference IDs of each frame by backtracking the dependency chain - until reaching a point where only decodable frames are being - referenced. Therefore it's recommended both for both the - flexible and the non-flexible mode that, when upgrade frames are - being encoded in response to a LIR, those packets should contain - layer indices and the reference fields so that the decoder or an - MCU can make this derivation. - - Example: - LIR {1,1,0}, {1,2,1} is sent by an MCU when it is currently - relaying {1,1,0} to a receiver and which wants to upgrade to - {1,2,1}. In response the encoder should encode the next frames - in layers {1,1,1} and {1,2,1} by only referring to frames in - {1,1,0}, {1,0,0} or {0,0,0}. - - In the non-flexible mode, periodic upgrade frames can be - defined by the layer structure of the SS, thus periodic upgrade - frames can be automatically identified by the picture ID. -
-
This payload format has two required parameters. @@ -844,8 +919,6 @@ X: | EXTENDED PID | (OPTIONAL) . . and heights up to 1552 pixels (97 macroblocks). - none - This media type is framed in RTP and contains binary data; see Section 4.8 of . @@ -868,6 +941,8 @@ X: | EXTENDED PID | (OPTIONAL) . . blankLines="0"/> For example: Video over IP, video conferencing. + N/A. + None. The clock rate in the "a=rtpmap" line MUST be 90000. The parameters "max-fs", and "max-fr", MUST be included in - the "a=fmtp" line of SDP. These parameters are expressed as a + the "a=fmtp" line of SDP if SDP is used to declare receiver capabilities. + These parameters are expressed as a media subtype string, in the form of a semicolon separated list of parameter=value pairs. @@ -927,14 +1003,7 @@ X: | EXTENDED PID | (OPTIONAL) . . target="RFC6386"/>, the max-fs and max-fr parameters MUST be used to establish these limits. - NOTE IN DRAFT: If closer control of width and height is desired, - the mechanism described in - draft-nandakumar-payload-sdp-max-video-resolution is a possible - candidate for signalling, but since that document appears to be far - from finalization, this document does not make a reference to that - document. This note is only intended for facilitating WG discussion, - and should be deleted before publication of this document as an - RFC. --> + -->
@@ -985,7 +1054,8 @@ X: | EXTENDED PID | (OPTIONAL) . .
- + + &vp9; &rfc2119; @@ -994,16 +1064,23 @@ X: | EXTENDED PID | (OPTIONAL) . . &rfc3550; - &rfc3711; - &rfc4566; &rfc6838; &rfc4855; - &rfc3551; + &lrr; + + + + + &rfc3551; + + &rfc3711; + +