From 224276fe21f8aae6124a6cba56334afec423d38d Mon Sep 17 00:00:00 2001 From: Ralph Giles Date: Wed, 21 Dec 2005 01:19:37 +0000 Subject: [PATCH] Set native eol-style. svn path=/trunk/vorbis/; revision=10667 --- doc/floor1_inverse_dB_table.html | 308 +++++------ doc/framing.html | 866 ++++++++++++++--------------- doc/helper.html | 478 ++++++++-------- doc/index.html | 228 ++++---- doc/oggstream.html | 468 ++++++++-------- doc/programming.html | 1108 +++++++++++++++++++------------------- doc/stereo.html | 836 ++++++++++++++-------------- doc/v-comment.html | 570 ++++++++++---------- doc/vorbis-fidelity.html | 360 ++++++------- doc/vorbis.html | 468 ++++++++-------- 10 files changed, 2845 insertions(+), 2845 deletions(-) diff --git a/doc/floor1_inverse_dB_table.html b/doc/floor1_inverse_dB_table.html index 7dc4acb..52d15e8 100644 --- a/doc/floor1_inverse_dB_table.html +++ b/doc/floor1_inverse_dB_table.html @@ -1,154 +1,154 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Ogg Vorbis I format specification: floor1_inverse_dB_table

- -

The vector [floor1_inverse_dB_table] is a 256 element static -lookup table consiting of the following values (read left to right -then top to bottom):

- -
-  1.0649863e-07, 1.1341951e-07, 1.2079015e-07, 1.2863978e-07, 
-  1.3699951e-07, 1.4590251e-07, 1.5538408e-07, 1.6548181e-07, 
-  1.7623575e-07, 1.8768855e-07, 1.9988561e-07, 2.1287530e-07, 
-  2.2670913e-07, 2.4144197e-07, 2.5713223e-07, 2.7384213e-07, 
-  2.9163793e-07, 3.1059021e-07, 3.3077411e-07, 3.5226968e-07, 
-  3.7516214e-07, 3.9954229e-07, 4.2550680e-07, 4.5315863e-07, 
-  4.8260743e-07, 5.1396998e-07, 5.4737065e-07, 5.8294187e-07, 
-  6.2082472e-07, 6.6116941e-07, 7.0413592e-07, 7.4989464e-07, 
-  7.9862701e-07, 8.5052630e-07, 9.0579828e-07, 9.6466216e-07, 
-  1.0273513e-06, 1.0941144e-06, 1.1652161e-06, 1.2409384e-06, 
-  1.3215816e-06, 1.4074654e-06, 1.4989305e-06, 1.5963394e-06, 
-  1.7000785e-06, 1.8105592e-06, 1.9282195e-06, 2.0535261e-06, 
-  2.1869758e-06, 2.3290978e-06, 2.4804557e-06, 2.6416497e-06, 
-  2.8133190e-06, 2.9961443e-06, 3.1908506e-06, 3.3982101e-06, 
-  3.6190449e-06, 3.8542308e-06, 4.1047004e-06, 4.3714470e-06, 
-  4.6555282e-06, 4.9580707e-06, 5.2802740e-06, 5.6234160e-06, 
-  5.9888572e-06, 6.3780469e-06, 6.7925283e-06, 7.2339451e-06, 
-  7.7040476e-06, 8.2047000e-06, 8.7378876e-06, 9.3057248e-06, 
-  9.9104632e-06, 1.0554501e-05, 1.1240392e-05, 1.1970856e-05, 
-  1.2748789e-05, 1.3577278e-05, 1.4459606e-05, 1.5399272e-05, 
-  1.6400004e-05, 1.7465768e-05, 1.8600792e-05, 1.9809576e-05, 
-  2.1096914e-05, 2.2467911e-05, 2.3928002e-05, 2.5482978e-05, 
-  2.7139006e-05, 2.8902651e-05, 3.0780908e-05, 3.2781225e-05, 
-  3.4911534e-05, 3.7180282e-05, 3.9596466e-05, 4.2169667e-05, 
-  4.4910090e-05, 4.7828601e-05, 5.0936773e-05, 5.4246931e-05, 
-  5.7772202e-05, 6.1526565e-05, 6.5524908e-05, 6.9783085e-05, 
-  7.4317983e-05, 7.9147585e-05, 8.4291040e-05, 8.9768747e-05, 
-  9.5602426e-05, 0.00010181521, 0.00010843174, 0.00011547824, 
-  0.00012298267, 0.00013097477, 0.00013948625, 0.00014855085, 
-  0.00015820453, 0.00016848555, 0.00017943469, 0.00019109536, 
-  0.00020351382, 0.00021673929, 0.00023082423, 0.00024582449, 
-  0.00026179955, 0.00027881276, 0.00029693158, 0.00031622787, 
-  0.00033677814, 0.00035866388, 0.00038197188, 0.00040679456, 
-  0.00043323036, 0.00046138411, 0.00049136745, 0.00052329927, 
-  0.00055730621, 0.00059352311, 0.00063209358, 0.00067317058, 
-  0.00071691700, 0.00076350630, 0.00081312324, 0.00086596457, 
-  0.00092223983, 0.00098217216, 0.0010459992,  0.0011139742, 
-  0.0011863665,  0.0012634633,  0.0013455702,  0.0014330129, 
-  0.0015261382,  0.0016253153,  0.0017309374,  0.0018434235, 
-  0.0019632195,  0.0020908006,  0.0022266726,  0.0023713743, 
-  0.0025254795,  0.0026895994,  0.0028643847,  0.0030505286, 
-  0.0032487691,  0.0034598925,  0.0036847358,  0.0039241906, 
-  0.0041792066,  0.0044507950,  0.0047400328,  0.0050480668, 
-  0.0053761186,  0.0057254891,  0.0060975636,  0.0064938176, 
-  0.0069158225,  0.0073652516,  0.0078438871,  0.0083536271, 
-  0.0088964928,  0.009474637,   0.010090352,   0.010746080, 
-  0.011444421,   0.012188144,   0.012980198,   0.013823725, 
-  0.014722068,   0.015678791,   0.016697687,   0.017782797, 
-  0.018938423,   0.020169149,   0.021479854,   0.022875735, 
-  0.024362330,   0.025945531,   0.027631618,   0.029427276, 
-  0.031339626,   0.033376252,   0.035545228,   0.037855157, 
-  0.040315199,   0.042935108,   0.045725273,   0.048696758, 
-  0.051861348,   0.055231591,   0.058820850,   0.062643361, 
-  0.066714279,   0.071049749,   0.075666962,   0.080584227, 
-  0.085821044,   0.091398179,   0.097337747,   0.10366330, 
-  0.11039993,    0.11757434,    0.12521498,    0.13335215, 
-  0.14201813,    0.15124727,    0.16107617,    0.17154380, 
-  0.18269168,    0.19456402,    0.20720788,    0.22067342, 
-  0.23501402,    0.25028656,    0.26655159,    0.28387361, 
-  0.30232132,    0.32196786,    0.34289114,    0.36517414, 
-  0.38890521,    0.41417847,    0.44109412,    0.46975890, 
-  0.50028648,    0.53279791,    0.56742212,    0.60429640, 
-  0.64356699,    0.68538959,    0.72993007,    0.77736504, 
-  0.82788260,    0.88168307,    0.9389798,     1.
-
- - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Ogg Vorbis I format specification: floor1_inverse_dB_table

+ +

The vector [floor1_inverse_dB_table] is a 256 element static +lookup table consisting of the following values (read left to right +then top to bottom):

+ +
+  1.0649863e-07, 1.1341951e-07, 1.2079015e-07, 1.2863978e-07, 
+  1.3699951e-07, 1.4590251e-07, 1.5538408e-07, 1.6548181e-07, 
+  1.7623575e-07, 1.8768855e-07, 1.9988561e-07, 2.1287530e-07, 
+  2.2670913e-07, 2.4144197e-07, 2.5713223e-07, 2.7384213e-07, 
+  2.9163793e-07, 3.1059021e-07, 3.3077411e-07, 3.5226968e-07, 
+  3.7516214e-07, 3.9954229e-07, 4.2550680e-07, 4.5315863e-07, 
+  4.8260743e-07, 5.1396998e-07, 5.4737065e-07, 5.8294187e-07, 
+  6.2082472e-07, 6.6116941e-07, 7.0413592e-07, 7.4989464e-07, 
+  7.9862701e-07, 8.5052630e-07, 9.0579828e-07, 9.6466216e-07, 
+  1.0273513e-06, 1.0941144e-06, 1.1652161e-06, 1.2409384e-06, 
+  1.3215816e-06, 1.4074654e-06, 1.4989305e-06, 1.5963394e-06, 
+  1.7000785e-06, 1.8105592e-06, 1.9282195e-06, 2.0535261e-06, 
+  2.1869758e-06, 2.3290978e-06, 2.4804557e-06, 2.6416497e-06, 
+  2.8133190e-06, 2.9961443e-06, 3.1908506e-06, 3.3982101e-06, 
+  3.6190449e-06, 3.8542308e-06, 4.1047004e-06, 4.3714470e-06, 
+  4.6555282e-06, 4.9580707e-06, 5.2802740e-06, 5.6234160e-06, 
+  5.9888572e-06, 6.3780469e-06, 6.7925283e-06, 7.2339451e-06, 
+  7.7040476e-06, 8.2047000e-06, 8.7378876e-06, 9.3057248e-06, 
+  9.9104632e-06, 1.0554501e-05, 1.1240392e-05, 1.1970856e-05, 
+  1.2748789e-05, 1.3577278e-05, 1.4459606e-05, 1.5399272e-05, 
+  1.6400004e-05, 1.7465768e-05, 1.8600792e-05, 1.9809576e-05, 
+  2.1096914e-05, 2.2467911e-05, 2.3928002e-05, 2.5482978e-05, 
+  2.7139006e-05, 2.8902651e-05, 3.0780908e-05, 3.2781225e-05, 
+  3.4911534e-05, 3.7180282e-05, 3.9596466e-05, 4.2169667e-05, 
+  4.4910090e-05, 4.7828601e-05, 5.0936773e-05, 5.4246931e-05, 
+  5.7772202e-05, 6.1526565e-05, 6.5524908e-05, 6.9783085e-05, 
+  7.4317983e-05, 7.9147585e-05, 8.4291040e-05, 8.9768747e-05, 
+  9.5602426e-05, 0.00010181521, 0.00010843174, 0.00011547824, 
+  0.00012298267, 0.00013097477, 0.00013948625, 0.00014855085, 
+  0.00015820453, 0.00016848555, 0.00017943469, 0.00019109536, 
+  0.00020351382, 0.00021673929, 0.00023082423, 0.00024582449, 
+  0.00026179955, 0.00027881276, 0.00029693158, 0.00031622787, 
+  0.00033677814, 0.00035866388, 0.00038197188, 0.00040679456, 
+  0.00043323036, 0.00046138411, 0.00049136745, 0.00052329927, 
+  0.00055730621, 0.00059352311, 0.00063209358, 0.00067317058, 
+  0.00071691700, 0.00076350630, 0.00081312324, 0.00086596457, 
+  0.00092223983, 0.00098217216, 0.0010459992,  0.0011139742, 
+  0.0011863665,  0.0012634633,  0.0013455702,  0.0014330129, 
+  0.0015261382,  0.0016253153,  0.0017309374,  0.0018434235, 
+  0.0019632195,  0.0020908006,  0.0022266726,  0.0023713743, 
+  0.0025254795,  0.0026895994,  0.0028643847,  0.0030505286, 
+  0.0032487691,  0.0034598925,  0.0036847358,  0.0039241906, 
+  0.0041792066,  0.0044507950,  0.0047400328,  0.0050480668, 
+  0.0053761186,  0.0057254891,  0.0060975636,  0.0064938176, 
+  0.0069158225,  0.0073652516,  0.0078438871,  0.0083536271, 
+  0.0088964928,  0.009474637,   0.010090352,   0.010746080, 
+  0.011444421,   0.012188144,   0.012980198,   0.013823725, 
+  0.014722068,   0.015678791,   0.016697687,   0.017782797, 
+  0.018938423,   0.020169149,   0.021479854,   0.022875735, 
+  0.024362330,   0.025945531,   0.027631618,   0.029427276, 
+  0.031339626,   0.033376252,   0.035545228,   0.037855157, 
+  0.040315199,   0.042935108,   0.045725273,   0.048696758, 
+  0.051861348,   0.055231591,   0.058820850,   0.062643361, 
+  0.066714279,   0.071049749,   0.075666962,   0.080584227, 
+  0.085821044,   0.091398179,   0.097337747,   0.10366330, 
+  0.11039993,    0.11757434,    0.12521498,    0.13335215, 
+  0.14201813,    0.15124727,    0.16107617,    0.17154380, 
+  0.18269168,    0.19456402,    0.20720788,    0.22067342, 
+  0.23501402,    0.25028656,    0.26655159,    0.28387361, 
+  0.30232132,    0.32196786,    0.34289114,    0.36517414, 
+  0.38890521,    0.41417847,    0.44109412,    0.46975890, 
+  0.50028648,    0.53279791,    0.56742212,    0.60429640, 
+  0.64356699,    0.68538959,    0.72993007,    0.77736504, 
+  0.82788260,    0.88168307,    0.9389798,     1.
+
+ + + + + diff --git a/doc/framing.html b/doc/framing.html index 91b8bb8..e0726b8 100644 --- a/doc/framing.html +++ b/doc/framing.html @@ -1,433 +1,433 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Ogg logical bitstream framing

- -

Ogg bitstreams

- -

The Ogg transport bitstream is designed to provide framing, error -protection and seeking structure for higher-level codec streams that -consist of raw, unencapsulated data packets, such as the Vorbis audio -codec or Tarkin video codec.

- -

Application example: Vorbis

- -

Vorbis encodes short-time blocks of PCM data into raw packets of -bit-packed data. These raw packets may be used directly by transport -mechanisms that provide their own framing and packet-separation -mechanisms (such as UDP datagrams). For stream based storage (such as -files) and transport (such as TCP streams or pipes), Vorbis uses the -Ogg bitstream format to provide framing/sync, sync recapture -after error, landmarks during seeking, and enough information to -properly separate data back into packets at the original packet -boundaries without relying on decoding to find packet boundaries.

- -

Design constraints for Ogg bitstreams

- -
    -
  1. True streaming; we must not need to seek to build a 100% - complete bitstream.
  2. -
  3. Use no more than approximately 1-2% of bitstream bandwidth for - packet boundary marking, high-level framing, sync and seeking.
  4. -
  5. Specification of absolute position within the original sample - stream.
  6. -
  7. Simple mechanism to ease limited editing, such as a simplified - concatenation mechanism.
  8. -
  9. Detection of corruption, recapture after error and direct, random - access to data at arbitrary positions in the bitstream.
  10. -
- -

Logical and Physical Bitstreams

- -

A logical Ogg bitstream is a contiguous stream of -sequential pages belonging only to the logical bitstream. A -physical Ogg bitstream is constructed from one or more -than one logical Ogg bitstream (the simplest physical bitstream -is simply a single logical bitstream). We describe below the exact -formatting of an Ogg logical bitstream. Combining logical -bitstreams into more complex physical bitstreams is described in the -Ogg bitstream overview. The exact -mapping of raw Vorbis packets into a valid Ogg Vorbis physical -bitstream is described in Vorbis -bitstream mapping.

- -

Bitstream structure

- -

An Ogg stream is structured by dividing incoming packets into -segments of up to 255 bytes and then wrapping a group of contiguous -packet segments into a variable length page preceded by a page -header. Both the header size and page size are variable; the page -header contains sizing information and checksum data to determine -header/page size and data integrity.

- -

The bitstream is captured (or recaptured) by looking for the beginning -of a page, specifically the capture pattern. Once the capture pattern -is found, the decoder verifies page sync and integrity by computing -and comparing the checksum. At that point, the decoder can extract the -packets themselves.

- -

Packet segmentation

- -

Packets are logically divided into multiple segments before encoding -into a page. Note that the segmentation and fragmentation process is a -logical one; it's used to compute page header values and the original -page data need not be disturbed, even when a packet spans page -boundaries.

- -

The raw packet is logically divided into [n] 255 byte segments and a -last fractional segment of < 255 bytes. A packet size may well -consist only of the trailing fractional segment, and a fractional -segment may be zero length. These values, called "lacing values" are -then saved and placed into the header segment table.

- -

An example should make the basic concept clear:

- -
-
-raw packet:
-  ___________________________________________
- |______________packet data__________________| 753 bytes
-
-lacing values for page header segment table: 255,255,243
-
-
- -

We simply add the lacing values for the total size; the last lacing -value for a packet is always the value that is less than 255. Note -that this encoding both avoids imposing a maximum packet size as well -as imposing minimum overhead on small packets (as opposed to, eg, -simply using two bytes at the head of every packet and having a max -packet size of 32k. Small packets (<255, the typical case) are -penalized with twice the segmentation overhead). Using the lacing -values as suggested, small packets see the minimum possible -byte-aligned overheade (1 byte) and large packets, over 512 bytes or -so, see a fairly constant ~.5% overhead on encoding space.

- -

Note that a lacing value of 255 implies that a second lacing value -follows in the packet, and a value of < 255 marks the end of the -packet after that many additional bytes. A packet of 255 bytes (or a -multiple of 255 bytes) is terminated by a lacing value of 0:

- -

-raw packet:
-  _______________________________
- |________packet data____________|          255 bytes
-
-lacing values: 255, 0
-
- -

Note also that a 'nil' (zero length) packet is not an error; it -consists of nothing more than a lacing value of zero in the header.

- -

Packets spanning pages

- -

Packets are not restricted to beginning and ending within a page, -although individual segments are, by definition, required to do so. -Packets are not restricted to a maximum size, although excessively -large packets in the data stream are discouraged; the Ogg -bitstream specification strongly recommends nominal page size of -approximately 4-8kB (large packets are foreseen as being useful for -initialization data at the beginning of a logical bitstream).

- -

After segmenting a packet, the encoder may decide not to place all the -resulting segments into the current page; to do so, the encoder places -the lacing values of the segments it wishes to belong to the current -page into the current segment table, then finishes the page. The next -page is begun with the first value in the segment table belonging to -the next packet segment, thus continuing the packet (data in the -packet body must also correspond properly to the lacing values in the -spanned pages. The segment data in the first packet corresponding to -the lacing values of the first page belong in that page; packet -segments listed in the segment table of the following page must begin -the page body of the subsequent page).

- -

The last mechanic to spanning a page boundary is to set the header -flag in the new page to indicate that the first lacing value in the -segment table continues rather than begins a packet; a header flag of -0x01 is set to indicate a continued packet. Although mandatory, it -is not actually algorithmically necessary; one could inspect the -preceding segment table to determine if the packet is new or -continued. Adding the information to the packet_header flag allows a -simpler design (with no overhead) that needs only inspect the current -page header after frame capture. This also allows faster error -recovery in the event that the packet originates in a corrupt -preceding page, implying that the previous page's segment table -cannot be trusted.

- -

Note that a packet can span an arbitrary number of pages; the above -spanning process is repeated for each spanned page boundary. Also a -'zero termination' on a packet size that is an even multiple of 255 -must appear even if the lacing value appears in the next page as a -zero-length continuation of the current packet. The header flag -should be set to 0x01 to indicate that the packet spanned, even though -the span is a nil case as far as data is concerned.

- -

The encoding looks odd, but is properly optimized for speed and the -expected case of the majority of packets being between 50 and 200 -bytes (note that it is designed such that packets of wildly different -sizes can be handled within the model; placing packet size -restrictions on the encoder would have only slightly simplified design -in page generation and increased overall encoder complexity).

- -

The main point behind tracking individual packets (and packet -segments) is to allow more flexible encoding tricks that requiring -explicit knowledge of packet size. An example is simple bandwidth -limiting, implemented by simply truncating packets in the nominal case -if the packet is arranged so that the least sensitive portion of the -data comes last.

- -

Page header

- -

The headering mechanism is designed to avoid copying and re-assembly -of the packet data (ie, making the packet segmentation process a -logical one); the header can be generated directly from incoming -packet data. The encoder buffers packet data until it finishes a -complete page at which point it writes the header followed by the -buffered packet segments.

- -

capture_pattern

- -

A header begins with a capture pattern that simplifies identifying -pages; once the decoder has found the capture pattern it can do a more -intensive job of verifying that it has in fact found a page boundary -(as opposed to an inadvertent coincidence in the byte stream).

- -

- byte value
-
-  0  0x4f 'O'
-  1  0x67 'g'
-  2  0x67 'g'
-  3  0x53 'S'  
-
- -

stream_structure_version

- -

The capture pattern is followed by the stream structure revision:

- -

- byte value
-
-  4  0x00
-
- -

header_type_flag

- -

The header type flag identifies this page's context in the bitstream:

- -

- byte value
-
-  5  bitflags: 0x01: unset = fresh packet
-	               set = continued packet
-	       0x02: unset = not first page of logical bitstream
-                       set = first page of logical bitstream (bos)
-	       0x04: unset = not last page of logical bitstream
-                       set = last page of logical bitstream (eos)
-
- -

absolute granule position

- -

(This is packed in the same way the rest of Ogg data is packed; LSb -of LSB first. Note that the 'position' data specifies a 'sample' -number (eg, in a CD quality sample is four octets, 16 bits for left -and 16 bits for right; in video it would likely be the frame number. -It is up to the specific codec in use to define the semantic meaning -of the granule position value). The position specified is the total -samples encoded after including all packets finished on this page -(packets begun on this page but continuing on to the next page do not -count). The rationale here is that the position specified in the -frame header of the last page tells how long the data coded by the -bitstream is. A truncated stream will still return the proper number -of samples that can be decoded fully.

- -

A special value of '-1' (in two's complement) indicates that no packets -finish on this page.

- -

- byte value
-
-  6  0xXX LSB
-  7  0xXX
-  8  0xXX
-  9  0xXX
- 10  0xXX
- 11  0xXX
- 12  0xXX
- 13  0xXX MSB
-
- -

stream serial number

- -

Ogg allows for separate logical bitstreams to be mixed at page -granularity in a physical bitstream. The most common case would be -sequential arrangement, but it is possible to interleave pages for -two separate bitstreams to be decoded concurrently. The serial -number is the means by which pages physical pages are associated with -a particular logical stream. Each logical stream must have a unique -serial number within a physical stream:

- -

- byte value
-
- 14  0xXX LSB
- 15  0xXX
- 16  0xXX
- 17  0xXX MSB
-
- -

page sequence no

- -

Page counter; lets us know if a page is lost (useful where packets -span page boundaries).

- -

- byte value
-
- 18  0xXX LSB
- 19  0xXX
- 20  0xXX
- 21  0xXX MSB
-
- -

page checksum

- -

32 bit CRC value (direct algorithm, initial val and final XOR = 0, -generator polynomial=0x04c11db7). The value is computed over the -entire header (with the CRC field in the header set to zero) and then -continued over the page. The CRC field is then filled with the -computed value.

- -

(A thorough discussion of CRC algorithms can be found in "A -Painless Guide to CRC Error Detection Algorithms" by Ross -Williams ross@guest.adelaide.edu.au.)

- -

- byte value
-
- 22  0xXX LSB
- 23  0xXX
- 24  0xXX
- 25  0xXX MSB
-
- -

page_segments

- -

The number of segment entries to appear in the segment table. The -maximum number of 255 segments (255 bytes each) sets the maximum -possible physical page size at 65307 bytes or just under 64kB (thus -we know that a header corrupted so as destroy sizing/alignment -information will not cause a runaway bitstream. We'll read in the -page according to the corrupted size information that's guaranteed to -be a reasonable size regardless, notice the checksum mismatch, drop -sync and then look for recapture).

- -

- byte value
-
- 26 0x00-0xff (0-255)
-
- -

segment_table (containing packet lacing values)

- -

The lacing values for each packet segment physically appearing in -this page are listed in contiguous order.

- -

- byte value
-
- 27 0x00-0xff (0-255)
- [...]
- n  0x00-0xff (0-255, n=page_segments+26)
-
- -

Total page size is calculated directly from the known header size and -lacing values in the segment table. Packet data segments follow -immediately after the header.

- -

Page headers typically impose a flat .25-.5% space overhead assuming -nominal ~8k page sizes. The segmentation table needed for exact -packet recovery in the streaming layer adds approximately .5-1% -nominal assuming expected encoder behavior in the 44.1kHz, 128kbps -stereo encodings.

- - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Ogg logical bitstream framing

+ +

Ogg bitstreams

+ +

The Ogg transport bitstream is designed to provide framing, error +protection and seeking structure for higher-level codec streams that +consist of raw, unencapsulated data packets, such as the Vorbis audio +codec or Tarkin video codec.

+ +

Application example: Vorbis

+ +

Vorbis encodes short-time blocks of PCM data into raw packets of +bit-packed data. These raw packets may be used directly by transport +mechanisms that provide their own framing and packet-separation +mechanisms (such as UDP datagrams). For stream based storage (such as +files) and transport (such as TCP streams or pipes), Vorbis uses the +Ogg bitstream format to provide framing/sync, sync recapture +after error, landmarks during seeking, and enough information to +properly separate data back into packets at the original packet +boundaries without relying on decoding to find packet boundaries.

+ +

Design constraints for Ogg bitstreams

+ +
    +
  1. True streaming; we must not need to seek to build a 100% + complete bitstream.
  2. +
  3. Use no more than approximately 1-2% of bitstream bandwidth for + packet boundary marking, high-level framing, sync and seeking.
  4. +
  5. Specification of absolute position within the original sample + stream.
  6. +
  7. Simple mechanism to ease limited editing, such as a simplified + concatenation mechanism.
  8. +
  9. Detection of corruption, recapture after error and direct, random + access to data at arbitrary positions in the bitstream.
  10. +
+ +

Logical and Physical Bitstreams

+ +

A logical Ogg bitstream is a contiguous stream of +sequential pages belonging only to the logical bitstream. A +physical Ogg bitstream is constructed from one or more +than one logical Ogg bitstream (the simplest physical bitstream +is simply a single logical bitstream). We describe below the exact +formatting of an Ogg logical bitstream. Combining logical +bitstreams into more complex physical bitstreams is described in the +Ogg bitstream overview. The exact +mapping of raw Vorbis packets into a valid Ogg Vorbis physical +bitstream is described in Vorbis +bitstream mapping.

+ +

Bitstream structure

+ +

An Ogg stream is structured by dividing incoming packets into +segments of up to 255 bytes and then wrapping a group of contiguous +packet segments into a variable length page preceded by a page +header. Both the header size and page size are variable; the page +header contains sizing information and checksum data to determine +header/page size and data integrity.

+ +

The bitstream is captured (or recaptured) by looking for the beginning +of a page, specifically the capture pattern. Once the capture pattern +is found, the decoder verifies page sync and integrity by computing +and comparing the checksum. At that point, the decoder can extract the +packets themselves.

+ +

Packet segmentation

+ +

Packets are logically divided into multiple segments before encoding +into a page. Note that the segmentation and fragmentation process is a +logical one; it's used to compute page header values and the original +page data need not be disturbed, even when a packet spans page +boundaries.

+ +

The raw packet is logically divided into [n] 255 byte segments and a +last fractional segment of < 255 bytes. A packet size may well +consist only of the trailing fractional segment, and a fractional +segment may be zero length. These values, called "lacing values" are +then saved and placed into the header segment table.

+ +

An example should make the basic concept clear:

+ +
+
+raw packet:
+  ___________________________________________
+ |______________packet data__________________| 753 bytes
+
+lacing values for page header segment table: 255,255,243
+
+
+ +

We simply add the lacing values for the total size; the last lacing +value for a packet is always the value that is less than 255. Note +that this encoding both avoids imposing a maximum packet size as well +as imposing minimum overhead on small packets (as opposed to, eg, +simply using two bytes at the head of every packet and having a max +packet size of 32k. Small packets (<255, the typical case) are +penalized with twice the segmentation overhead). Using the lacing +values as suggested, small packets see the minimum possible +byte-aligned overhead (1 byte) and large packets, over 512 bytes or +so, see a fairly constant ~.5% overhead on encoding space.

+ +

Note that a lacing value of 255 implies that a second lacing value +follows in the packet, and a value of < 255 marks the end of the +packet after that many additional bytes. A packet of 255 bytes (or a +multiple of 255 bytes) is terminated by a lacing value of 0:

+ +

+raw packet:
+  _______________________________
+ |________packet data____________|          255 bytes
+
+lacing values: 255, 0
+
+ +

Note also that a 'nil' (zero length) packet is not an error; it +consists of nothing more than a lacing value of zero in the header.

+ +

Packets spanning pages

+ +

Packets are not restricted to beginning and ending within a page, +although individual segments are, by definition, required to do so. +Packets are not restricted to a maximum size, although excessively +large packets in the data stream are discouraged; the Ogg +bitstream specification strongly recommends nominal page size of +approximately 4-8kB (large packets are foreseen as being useful for +initialization data at the beginning of a logical bitstream).

+ +

After segmenting a packet, the encoder may decide not to place all the +resulting segments into the current page; to do so, the encoder places +the lacing values of the segments it wishes to belong to the current +page into the current segment table, then finishes the page. The next +page is begun with the first value in the segment table belonging to +the next packet segment, thus continuing the packet (data in the +packet body must also correspond properly to the lacing values in the +spanned pages. The segment data in the first packet corresponding to +the lacing values of the first page belong in that page; packet +segments listed in the segment table of the following page must begin +the page body of the subsequent page).

+ +

The last mechanic to spanning a page boundary is to set the header +flag in the new page to indicate that the first lacing value in the +segment table continues rather than begins a packet; a header flag of +0x01 is set to indicate a continued packet. Although mandatory, it +is not actually algorithmically necessary; one could inspect the +preceding segment table to determine if the packet is new or +continued. Adding the information to the packet_header flag allows a +simpler design (with no overhead) that needs only inspect the current +page header after frame capture. This also allows faster error +recovery in the event that the packet originates in a corrupt +preceding page, implying that the previous page's segment table +cannot be trusted.

+ +

Note that a packet can span an arbitrary number of pages; the above +spanning process is repeated for each spanned page boundary. Also a +'zero termination' on a packet size that is an even multiple of 255 +must appear even if the lacing value appears in the next page as a +zero-length continuation of the current packet. The header flag +should be set to 0x01 to indicate that the packet spanned, even though +the span is a nil case as far as data is concerned.

+ +

The encoding looks odd, but is properly optimized for speed and the +expected case of the majority of packets being between 50 and 200 +bytes (note that it is designed such that packets of wildly different +sizes can be handled within the model; placing packet size +restrictions on the encoder would have only slightly simplified design +in page generation and increased overall encoder complexity).

+ +

The main point behind tracking individual packets (and packet +segments) is to allow more flexible encoding tricks that require +explicit knowledge of packet size. An example is simple bandwidth +limiting, implemented by simply truncating packets in the nominal case +if the packet is arranged so that the least sensitive portion of the +data comes last.

+ +

Page header

+ +

The headering mechanism is designed to avoid copying and re-assembly +of the packet data (ie, making the packet segmentation process a +logical one); the header can be generated directly from incoming +packet data. The encoder buffers packet data until it finishes a +complete page at which point it writes the header followed by the +buffered packet segments.

+ +

capture_pattern

+ +

A header begins with a capture pattern that simplifies identifying +pages; once the decoder has found the capture pattern it can do a more +intensive job of verifying that it has in fact found a page boundary +(as opposed to an inadvertent coincidence in the byte stream).

+ +

+ byte value
+
+  0  0x4f 'O'
+  1  0x67 'g'
+  2  0x67 'g'
+  3  0x53 'S'  
+
+ +

stream_structure_version

+ +

The capture pattern is followed by the stream structure revision:

+ +

+ byte value
+
+  4  0x00
+
+ +

header_type_flag

+ +

The header type flag identifies this page's context in the bitstream:

+ +

+ byte value
+
+  5  bitflags: 0x01: unset = fresh packet
+	               set = continued packet
+	       0x02: unset = not first page of logical bitstream
+                       set = first page of logical bitstream (bos)
+	       0x04: unset = not last page of logical bitstream
+                       set = last page of logical bitstream (eos)
+
+ +

absolute granule position

+ +

(This is packed in the same way the rest of Ogg data is packed; LSb +of LSB first.) Note that the 'position' data specifies a 'sample' +number (eg, in CD quality audio a sample is four octets, 16 bits for left +and 16 bits for right; in video it would likely be the frame number. +It is up to the specific codec in use to define the semantic meaning +of the granule position value). The position specified is the total +samples encoded after including all packets finished on this page +(packets begun on this page but continuing on to the next page do not +count). The rationale here is that the position specified in the +frame header of the last page tells how long the data coded by the +bitstream is. A truncated stream will still return the proper number +of samples that can be decoded fully.

+ +

A special value of '-1' (in two's complement) indicates that no packets +finish on this page.

+ +

+ byte value
+
+  6  0xXX LSB
+  7  0xXX
+  8  0xXX
+  9  0xXX
+ 10  0xXX
+ 11  0xXX
+ 12  0xXX
+ 13  0xXX MSB
+
+ +

stream serial number

+ +

Ogg allows for separate logical bitstreams to be mixed at page +granularity in a physical bitstream. The most common case would be +sequential arrangement, but it is possible to interleave pages for +two separate bitstreams to be decoded concurrently. The serial +number is the means by which physical pages are associated with +a particular logical stream. Each logical stream must have a unique +serial number within a physical stream:

+ +

+ byte value
+
+ 14  0xXX LSB
+ 15  0xXX
+ 16  0xXX
+ 17  0xXX MSB
+
+ +

page sequence no

+ +

Page counter; lets us know if a page is lost (useful where packets +span page boundaries).

+ +

+ byte value
+
+ 18  0xXX LSB
+ 19  0xXX
+ 20  0xXX
+ 21  0xXX MSB
+
+ +

page checksum

+ +

32 bit CRC value (direct algorithm, initial val and final XOR = 0, +generator polynomial=0x04c11db7). The value is computed over the +entire header (with the CRC field in the header set to zero) and then +continued over the page. The CRC field is then filled with the +computed value.

+ +

(A thorough discussion of CRC algorithms can be found in "A +Painless Guide to CRC Error Detection Algorithms" by Ross +Williams ross@guest.adelaide.edu.au.)

+ +

+ byte value
+
+ 22  0xXX LSB
+ 23  0xXX
+ 24  0xXX
+ 25  0xXX MSB
+
+ +

page_segments

+ +

The number of segment entries to appear in the segment table. The +maximum number of 255 segments (255 bytes each) sets the maximum +possible physical page size at 65307 bytes or just under 64kB (thus +we know that a header corrupted so as to destroy sizing/alignment +information will not cause a runaway bitstream. We'll read in the +page according to the corrupted size information that's guaranteed to +be a reasonable size regardless, notice the checksum mismatch, drop +sync and then look for recapture).

+ +

+ byte value
+
+ 26 0x00-0xff (0-255)
+
+ +

segment_table (containing packet lacing values)

+ +

The lacing values for each packet segment physically appearing in +this page are listed in contiguous order.

+ +

+ byte value
+
+ 27 0x00-0xff (0-255)
+ [...]
+ n  0x00-0xff (0-255, n=page_segments+26)
+
+ +

Total page size is calculated directly from the known header size and +lacing values in the segment table. Packet data segments follow +immediately after the header.

+ +

Page headers typically impose a flat .25-.5% space overhead assuming +nominal ~8k page sizes. The segmentation table needed for exact +packet recovery in the streaming layer adds approximately .5-1% +nominal assuming expected encoder behavior in the 44.1kHz, 128kbps +stereo encodings.

+ + + + + diff --git a/doc/helper.html b/doc/helper.html index 5ad551a..f8ff0c7 100644 --- a/doc/helper.html +++ b/doc/helper.html @@ -1,239 +1,239 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Ogg Vorbis I format specification: helper equations

- -

Overview

- -

The equations below are used in multiple places by the Vorbis codec -specification. Rather than cluttering up the main specification -documents, they are defined here and linked in the main documents -where appropriate.

- -

ilog

- -

The "ilog(x)" function returns the position number (1 through n) of the -highest set bit in the two's complement integer value -[x]. Values of [x] less than zero are defined to return zero.

- -
-  1) [return_value] = 0;
-  2) if ( [x] is greater than zero ){
-      
-       3) increment [return_value];
-       4) logical shift [x] one bit to the right, padding the MSb with zero
-       5) repeat at step 2)
-
-     }
-
-   6) done
-
- -

Examples:

- - - -

float32_unpack

- -

"float32_unpack(x)" is intended to translate the packed binary -representation of a Vorbis codebook float value into the -representation used by the decoder for floating point numbers. For -purposes of this example, we will unpack a Vorbis float32 into a -host-native floating point number.

- -
-  1) [mantissa] = [x] bitwise AND 0x1fffff (unsigned result)
-  2) [sign] = [x] bitwise AND 0x80000000 (unsigned result)
-  3) [exponent] = ( [x] bitwise AND 0x7fe00000) shifted right 21 bits (unsigned result)
-  4) if ( [sign] is nonzero ) then negate [mantissa]
-  5) return [mantissa] * ( 2 ^ ( [exponent] - 788 ) )
-
- -

lookup1_values

- -

"lookup1_values(codebook_entries,codebook_dimensions)" is used to -compute the correct length of the value index for a codebook VQ lookup -table of lookup type 1. The values on this list are permuted to -construct the VQ vector lookup table of size -[codebook_entries].

- -

The return value for this function is defined to be 'the greatest -integer value for which [return_value] to the power of -[codebook_dimensions] is less than or equal to -[codebook_entries]'.

- -

low_neighbor

- -

"low_neighbor(v,x)" finds the position n in vector [v] of -the greatest value scalar element for which n is less than -[x] and vector [v] element n is less -than vector [v] element [x].

- -

high_neighbor

- -

"high_neighbor(v,x)" finds the position n in vector [v] of -the lowest value scalar element for which n is less than -[x] and vector [v] element n is greater -than vector [v] element [x].

- -

render_point

- -

"render_point(x0,y0,x1,y1,X)" is used to find the Y value at point X -along the line specified by x0, x1, y0 and y1. This function uses an -integer algorithm to solve for the point directly without calculating -intervening values along the line.

- -
-  1)  [dy] = [y1] - [y0]
-  2) [adx] = [x1] - [x0]
-  3) [ady] = absolute value of [dy]
-  4) [err] = [ady] * ([X] - [x0])
-  5) [off] = [err] / [adx] using integer division
-  6) if ( [dy] is less than zero ) {
-
-       7) [Y] = [y0] - [off]
-
-     } else {
-
-       8) [Y] = [y0] + [off]
-  
-     }
-
-  9) done
-
- -

render_line

- -

Floor decode type one uses the integer line drawing algorithm of -"render_line(x0, y0, x1, y1, v)" to construct an integer floor -curve for contiguous piecewise line segments. Note that it has not -been relevant elsewhere, but here we must define integer division as -rounding division of both positive and negative numbers toward zero.

- -
-  1)   [dy] = [y1] - [y0]
-  2)  [adx] = [x1] - [x0]
-  3)  [ady] = absolute value of [dy]
-  4) [base] = [dy] / [adx] using integer division
-  5)    [x] = [x0]
-  6)    [y] = [y0]
-  7)  [err] = 0
-
-  8) if ( [dy] is less than 0 ) {
-
-        9) [sy] = [base] - 1
-
-     } else {
-
-       10) [sy] = [base] + 1
-
-     }
-
- 11) [ady] = [ady] - (absolute value of [base]) * [adx]
- 12) vector [v] element [x] = [y]
-
- 13) iterate [x] over the range [x0]+1 ... [x1]-1 {
-
-       14) [err] = [err] + [ady];
-       15) if ( [err] >= [adx] ) {
-
-             15) [err] = [err] - [adx]
-             16)   [y] = [y] + [sy]
-
-           } else {
-
-             17) [y] = [y] + [base]
-   
-           }
-
-       18) vector [v] element [x] = [y]
-
-     }
-
- - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Ogg Vorbis I format specification: helper equations

+ +

Overview

+ +

The equations below are used in multiple places by the Vorbis codec +specification. Rather than cluttering up the main specification +documents, they are defined here and linked in the main documents +where appropriate.

+ +

ilog

+ +

The "ilog(x)" function returns the position number (1 through n) of the +highest set bit in the two's complement integer value +[x]. Values of [x] less than zero are defined to return zero.

+ +
+  1) [return_value] = 0;
+  2) if ( [x] is greater than zero ){
+      
+       3) increment [return_value];
+       4) logical shift [x] one bit to the right, padding the MSb with zero
+       5) repeat at step 2)
+
+     }
+
+   6) done
+
+ +

Examples:

+ + + +

float32_unpack

+ +

"float32_unpack(x)" is intended to translate the packed binary +representation of a Vorbis codebook float value into the +representation used by the decoder for floating point numbers. For +purposes of this example, we will unpack a Vorbis float32 into a +host-native floating point number.

+ +
+  1) [mantissa] = [x] bitwise AND 0x1fffff (unsigned result)
+  2) [sign] = [x] bitwise AND 0x80000000 (unsigned result)
+  3) [exponent] = ( [x] bitwise AND 0x7fe00000) shifted right 21 bits (unsigned result)
+  4) if ( [sign] is nonzero ) then negate [mantissa]
+  5) return [mantissa] * ( 2 ^ ( [exponent] - 788 ) )
+
+ +

lookup1_values

+ +

"lookup1_values(codebook_entries,codebook_dimensions)" is used to +compute the correct length of the value index for a codebook VQ lookup +table of lookup type 1. The values on this list are permuted to +construct the VQ vector lookup table of size +[codebook_entries].

+ +

The return value for this function is defined to be 'the greatest +integer value for which [return_value] to the power of +[codebook_dimensions] is less than or equal to +[codebook_entries]'.

+ +

low_neighbor

+ +

"low_neighbor(v,x)" finds the position n in vector [v] of +the greatest value scalar element for which n is less than +[x] and vector [v] element n is less +than vector [v] element [x].

+ +

high_neighbor

+ +

"high_neighbor(v,x)" finds the position n in vector [v] of +the lowest value scalar element for which n is less than +[x] and vector [v] element n is greater +than vector [v] element [x].

+ +

render_point

+ +

"render_point(x0,y0,x1,y1,X)" is used to find the Y value at point X +along the line specified by x0, x1, y0 and y1. This function uses an +integer algorithm to solve for the point directly without calculating +intervening values along the line.

+ +
+  1)  [dy] = [y1] - [y0]
+  2) [adx] = [x1] - [x0]
+  3) [ady] = absolute value of [dy]
+  4) [err] = [ady] * ([X] - [x0])
+  5) [off] = [err] / [adx] using integer division
+  6) if ( [dy] is less than zero ) {
+
+       7) [Y] = [y0] - [off]
+
+     } else {
+
+       8) [Y] = [y0] + [off]
+  
+     }
+
+  9) done
+
+ +

render_line

+ +

Floor decode type one uses the integer line drawing algorithm of +"render_line(x0, y0, x1, y1, v)" to construct an integer floor +curve for contiguous piecewise line segments. Note that it has not +been relevant elsewhere, but here we must define integer division as +rounding division of both positive and negative numbers toward zero.

+ +
+  1)   [dy] = [y1] - [y0]
+  2)  [adx] = [x1] - [x0]
+  3)  [ady] = absolute value of [dy]
+  4) [base] = [dy] / [adx] using integer division
+  5)    [x] = [x0]
+  6)    [y] = [y0]
+  7)  [err] = 0
+
+  8) if ( [dy] is less than 0 ) {
+
+        9) [sy] = [base] - 1
+
+     } else {
+
+       10) [sy] = [base] + 1
+
+     }
+
+ 11) [ady] = [ady] - (absolute value of [base]) * [adx]
+ 12) vector [v] element [x] = [y]
+
+ 13) iterate [x] over the range [x0]+1 ... [x1]-1 {
+
+       14) [err] = [err] + [ady];
+       15) if ( [err] >= [adx] ) {
+
+             16) [err] = [err] - [adx]
+             17)   [y] = [y] + [sy]
+
+           } else {
+
+             18) [y] = [y] + [base]
+   
+           }
+
+       19) vector [v] element [x] = [y]
+
+     }
+
+ + + + + diff --git a/doc/index.html b/doc/index.html index 26de6e1..769c3d1 100644 --- a/doc/index.html +++ b/doc/index.html @@ -1,114 +1,114 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Ogg Vorbis Documentation

- -

Vorbis technical discussion documents

- - -

Ogg Vorbis I specification

- - - -

Ogg Vorbis programming documents

- - - -

Ogg bitstream documentation

- - - - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Ogg Vorbis Documentation

+ +

Vorbis technical discussion documents

+ + +

Ogg Vorbis I specification

+ + + +

Ogg Vorbis programming documents

+ + + +

Ogg bitstream documentation

+ + + + + + + diff --git a/doc/oggstream.html b/doc/oggstream.html index 9d43831..0d12d36 100644 --- a/doc/oggstream.html +++ b/doc/oggstream.html @@ -1,234 +1,234 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Ogg logical and physical bitstream overview

- -

Ogg bitstreams

- -

Ogg codecs use octet vectors of raw, compressed data -(packets). These compressed packets do not have any -high-level structure or boundary information; strung together, they -appear to be streams of random bytes with no landmarks.

- -

Raw packets may be used directly by transport mechanisms that provide -their own framing and packet-separation mechanisms (such as UDP -datagrams). For stream based storage (such as files) and transport -(such as TCP streams or pipes), Vorbis and other future Ogg codecs use -the Ogg bitstream format to provide framing/sync, sync recapture -after error, landmarks during seeking, and enough information to -properly separate data back into packets at the original packet -boundaries without relying on decoding to find packet boundaries.

- -

Logical and physical bitstreams

- -

Raw packets are grouped and encoded into contiguous pages of -structured bitstream data called logical bitstreams. A -logical bitstream consists of pages, in order, belonging to a single -codec instance. Each page is a self contained entity (although it is -possible that a packet may be split and encoded across one or more -pages); that is, the page decode mechanism is designed to recognize, -verify and handle single pages at a time from the overall bitstream.

- -

Multiple logical bitstreams can be combined (with restrictions) into a -single physical bitstream. A physical bitstream consists of -multiple logical bitstreams multiplexed at the page level and may -include a 'meta-header' at the beginning of the multiplexed logical -stream that serves as identification magic. Whole pages are taken in -order from multiple logical bitstreams and combined into a single -physical stream of pages. The decoder reconstructs the original -logical bitstreams from the physical bitstream by taking the pages in -order from the physical bitstream and redirecting them into the -appropriate logical decoding entity. The simplest physical bitstream -is a single, unmultiplexed logical bitstream with no meta-header; this -is referred to as a 'degenerate stream'.

- -

Ogg Logical Bitstream Framing discusses -the page format of an Ogg bitstream, the packet coding process -and logical bitstreams in detail. The remainder of this document -specifies requirements for constructing finished, physical Ogg -bitstreams.

- -

Mapping Restrictions

- -

Logical bitstreams may not be mapped/multiplexed into physical -bitstreams without restriction. Here we discuss design restrictions -on Ogg physical bitstreams in general, mostly to introduce -design rationale. Each 'media' format defines its own (generally more -restrictive) mapping. An 'Ogg Vorbis Audio Bitstream', for example, has a -specific physical bitstream structure. -An 'Ogg A/V' bitstream (not currently specified) will also mandate a -specific, restricted physical bitstream format.

- -

additional end-to-end structure

- -

The framing specification defines -'beginning of stream' and 'end of stream' page markers via a header -flag (it is possible for a stream to consist of a single page). A -stream always consists of an integer number of pages, an easy -requirement given the variable size nature of pages.

- -

In addition to the header flag marking the first and last pages of a -logical bitstream, the first page of an Ogg bitstream obeys -additional restrictions. Each individual media mapping specifies its -own implementation details regarding these restrictions.

- -

The first page of a logical Ogg bitstream consists of a single, -small 'initial header' packet that includes sufficient information to -identify the exact CODEC type and media requirements of the logical -bitstream. The intent of this restriction is to simplify identifying -the bitstream type and content; for a given media type (or across all -Ogg media types) we can know that we only need a small, fixed -amount of data to uniquely identify the bitstream type.

- -

As an example, Ogg Vorbis places the name and revision of the Vorbis -CODEC, the audio rate and the audio quality into this initial header, -thus simplifying vastly the certain identification of an Ogg Vorbis -audio bitstream.

- -

sequential multiplexing (chaining)

- -

The simplest form of logical bitstream multiplexing is concatenation -(chaining). Complete logical bitstreams are strung -one-after-another in order. The bitstreams do not overlap; the final -page of a given logical bitstream is immediately followed by the -initial page of the next. Chaining is the only logical->physical -mapping allowed by Ogg Vorbis.

- -

Each chained logical bitstream must have a unique serial number within -the scope of the physical bitstream.

- -

concurrent multiplexing (grouping)

- -

Logical bitstreams may also be multiplexed 'in parallel' -(grouped). An example of grouping would be to allow -streaming of separate audio and video streams, using different codecs -and different logical bitstreams, in the same physical bitstream. -Whole pages from multiple logical bitstreams are mixed together.

- -

The initial pages of each logical bitstream must appear first; the -media mapping specifies the order of the initial pages. For example, -Ogg A/V will eventually specify an Ogg video bitstream with -audio. The mapping may specify that the physical bitstream must begin -with the initial page of a logical video bitstream, followed by the -initial page of an audio stream. Unlike initial pages, terminal pages -for the logical bitstreams need not all occur contiguously (although a -specific media mapping may require this; it is not mandated by the -generic Ogg stream spec). Terminal pages may be 'nil' pages, -that is, pages containing no content but simply a page header with -position information and the 'last page of bitstream' flag set in the -page header.

- -

Each grouped bitstream must have a unique serial number within the -scope of the physical bitstream.

- -

sequential and concurrent multiplexing

- -

Groups of concurrently multiplexed bitstreams may be chained -consecutively. Such a physical bitstream obeys all the rules of both -grouped and chained multiplexed streams; the groups, when unchained , -must stand on their own as a valid concurrently multiplexed -bitstream.

- -

multiplexing example

- -

Below, we present an example of a grouped and chained bitstream:

- -

stream

- -

In this example, we see pages from five total logical bitstreams -multiplexed into a physical bitstream. Note the following -characteristics:

- -
    -
  1. Grouped bitstreams begin together; all of the initial pages -must appear before any data pages. When concurrently multiplexed -groups are chained, the new group does not begin until all the -bitstreams in the previous group have terminated.
  2. - -
  3. The pages of concurrently multiplexed bitstreams need not conform -to a regular order; the only requirement is that page n of a -logical bitstream follow page n-1 in the physical bitstream. -There are no restrictions on intervening pages belonging to other -logical bitstreams. (Tying page appearance to bitrate demands is one -logical strategy, ie, the page appears at the chronological point -where decode requires more information).
  4. -
- - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Ogg logical and physical bitstream overview

+ +

Ogg bitstreams

+ +

Ogg codecs use octet vectors of raw, compressed data +(packets). These compressed packets do not have any +high-level structure or boundary information; strung together, they +appear to be streams of random bytes with no landmarks.

+ +

Raw packets may be used directly by transport mechanisms that provide +their own framing and packet-separation mechanisms (such as UDP +datagrams). For stream based storage (such as files) and transport +(such as TCP streams or pipes), Vorbis and other future Ogg codecs use +the Ogg bitstream format to provide framing/sync, sync recapture +after error, landmarks during seeking, and enough information to +properly separate data back into packets at the original packet +boundaries without relying on decoding to find packet boundaries.

+ +

Logical and physical bitstreams

+ +

Raw packets are grouped and encoded into contiguous pages of +structured bitstream data called logical bitstreams. A +logical bitstream consists of pages, in order, belonging to a single +codec instance. Each page is a self contained entity (although it is +possible that a packet may be split and encoded across one or more +pages); that is, the page decode mechanism is designed to recognize, +verify and handle single pages at a time from the overall bitstream.

+ +

Multiple logical bitstreams can be combined (with restrictions) into a +single physical bitstream. A physical bitstream consists of +multiple logical bitstreams multiplexed at the page level and may +include a 'meta-header' at the beginning of the multiplexed logical +stream that serves as identification magic. Whole pages are taken in +order from multiple logical bitstreams and combined into a single +physical stream of pages. The decoder reconstructs the original +logical bitstreams from the physical bitstream by taking the pages in +order from the physical bitstream and redirecting them into the +appropriate logical decoding entity. The simplest physical bitstream +is a single, unmultiplexed logical bitstream with no meta-header; this +is referred to as a 'degenerate stream'.

+ +

Ogg Logical Bitstream Framing discusses +the page format of an Ogg bitstream, the packet coding process +and logical bitstreams in detail. The remainder of this document +specifies requirements for constructing finished, physical Ogg +bitstreams.

+ +

Mapping Restrictions

+ +

Logical bitstreams may not be mapped/multiplexed into physical +bitstreams without restriction. Here we discuss design restrictions +on Ogg physical bitstreams in general, mostly to introduce +design rationale. Each 'media' format defines its own (generally more +restrictive) mapping. An 'Ogg Vorbis Audio Bitstream', for example, has a +specific physical bitstream structure. +An 'Ogg A/V' bitstream (not currently specified) will also mandate a +specific, restricted physical bitstream format.

+ +

additional end-to-end structure

+ +

The framing specification defines +'beginning of stream' and 'end of stream' page markers via a header +flag (it is possible for a stream to consist of a single page). A +stream always consists of an integer number of pages, an easy +requirement given the variable size nature of pages.

+ +

In addition to the header flag marking the first and last pages of a +logical bitstream, the first page of an Ogg bitstream obeys +additional restrictions. Each individual media mapping specifies its +own implementation details regarding these restrictions.

+ +

The first page of a logical Ogg bitstream consists of a single, +small 'initial header' packet that includes sufficient information to +identify the exact CODEC type and media requirements of the logical +bitstream. The intent of this restriction is to simplify identifying +the bitstream type and content; for a given media type (or across all +Ogg media types) we can know that we only need a small, fixed +amount of data to uniquely identify the bitstream type.

+ +

As an example, Ogg Vorbis places the name and revision of the Vorbis +CODEC, the audio rate and the audio quality into this initial header, +thus simplifying vastly the certain identification of an Ogg Vorbis +audio bitstream.

+ +

sequential multiplexing (chaining)

+ +

The simplest form of logical bitstream multiplexing is concatenation +(chaining). Complete logical bitstreams are strung +one-after-another in order. The bitstreams do not overlap; the final +page of a given logical bitstream is immediately followed by the +initial page of the next. Chaining is the only logical->physical +mapping allowed by Ogg Vorbis.

+ +

Each chained logical bitstream must have a unique serial number within +the scope of the physical bitstream.

+ +

concurrent multiplexing (grouping)

+ +

Logical bitstreams may also be multiplexed 'in parallel' +(grouped). An example of grouping would be to allow +streaming of separate audio and video streams, using different codecs +and different logical bitstreams, in the same physical bitstream. +Whole pages from multiple logical bitstreams are mixed together.

+ +

The initial pages of each logical bitstream must appear first; the +media mapping specifies the order of the initial pages. For example, +Ogg A/V will eventually specify an Ogg video bitstream with +audio. The mapping may specify that the physical bitstream must begin +with the initial page of a logical video bitstream, followed by the +initial page of an audio stream. Unlike initial pages, terminal pages +for the logical bitstreams need not all occur contiguously (although a +specific media mapping may require this; it is not mandated by the +generic Ogg stream spec). Terminal pages may be 'nil' pages, +that is, pages containing no content but simply a page header with +position information and the 'last page of bitstream' flag set in the +page header.

+ +

Each grouped bitstream must have a unique serial number within the +scope of the physical bitstream.

+ +

sequential and concurrent multiplexing

+ +

Groups of concurrently multiplexed bitstreams may be chained +consecutively. Such a physical bitstream obeys all the rules of both +grouped and chained multiplexed streams; the groups, when unchained, +must stand on their own as a valid concurrently multiplexed +bitstream.

+ +

multiplexing example

+ +

Below, we present an example of a grouped and chained bitstream:

+ +

stream

+ +

In this example, we see pages from five total logical bitstreams +multiplexed into a physical bitstream. Note the following +characteristics:

+ +
    +
  1. Grouped bitstreams begin together; all of the initial pages +must appear before any data pages. When concurrently multiplexed +groups are chained, the new group does not begin until all the +bitstreams in the previous group have terminated.
  2. + +
  3. The pages of concurrently multiplexed bitstreams need not conform +to a regular order; the only requirement is that page n of a +logical bitstream follow page n-1 in the physical bitstream. +There are no restrictions on intervening pages belonging to other +logical bitstreams. (Tying page appearance to bitrate demands is one +logical strategy, ie, the page appears at the chronological point +where decode requires more information).
  4. +
+ + + + + diff --git a/doc/programming.html b/doc/programming.html index 97ddb64..65ccb48 100644 --- a/doc/programming.html +++ b/doc/programming.html @@ -1,554 +1,554 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Programming with Xiph.org libvorbis

- -

Description

- -

Libvorbis is the Xiph.org Foundation's portable Ogg Vorbis CODEC -implemented as a programmatic library. Libvorbis provides primitives -to handle framing and manipulation of Ogg bitstreams (used by the -Vorbis for streaming), a full analysis (encoding) interface as well as -packet decoding and synthesis for playback.

- -

The libvorbis library does not provide any system interface; a -full-featured demonstration player included with the library -distribtion provides example code for a variety of system interfaces -as well as a working example of using libvorbis in production code.

- -

Encoding Overview

- -

Decoding Overview

- -

Decoding a bitstream with libvorbis follows roughly the following -steps:

- -
    -
  1. Frame the incoming bitstream into pages
  2. -
  3. Sort the pages by logical bitstream and buffer then into logical streams
  4. -
  5. Decompose the logical streams into raw packets
  6. -
  7. Reconstruct segments of the original data from each packet
  8. -
  9. Glue the reconstructed segments back into a decoded stream
  10. -
- -

Framing

- -

An Ogg bitstream is logically arranged into pages, but to decode -the pages, we have to find them first. The raw bitstream is first fed -into an ogg_sync_state buffer using ogg_sync_buffer() -and ogg_sync_wrote(). After each block we submit to the sync -buffer, we should check to see if we can frame and extract a complete -page or pages using ogg_sync_pageout(). Extra pages are -buffered; allowing them to build up in the ogg_sync_state -buffer will eventually exhaust memory.

- -

The Ogg pages returned from ogg_sync_pageout need not be -decoded further to be used as landmarks in seeking; seeking can be -either a rough process of simply jumping to approximately intuited -portions of the bitstream, or it can be a precise bisection process -that captures pages and inspects data position. When seeking, -however, sequential multiplexing (chaining) must be accounted for; -beginning play in a new logical bitstream requires initializing a -synthesis engine with the headers from that bitstream. Vorbis -bitstreams do not make use of concurent multiplexing (grouping).

- -

Sorting

- -

The pages produced by ogg_sync_pageout are then sorted by -serial number to seperate logical bitstreams. Initialize logical -bitstream buffers (og_stream_state) using -ogg_stream_init(). Pages are submitted to the matching -logical bitstream buffer using ogg_stream_pagein; the serial -number of the page and the stream buffer must match, or the page will -be rejected. A page submitted out of sequence will simply be noted, -and in the course of outputting packets, the hole will be flagged -(ogg_sync_pageout and ogg_stream_packetout will -return a negative value at positions where they had to recapture the -stream).

- -

Extracting packets

- -

After submitting page[s] to a logical stream, read available packets -using ogg_stream_packetout.

- -

Decoding packets

- -

Reassembling data segments

- -

Ogg Bitstream Manipulation Structures

- -

Two of the Ogg bitstream data structures are intended to be -transparent to the developer; the fields should be used directly.

- -

ogg_packet

- -
-typedef struct {
-  unsigned char *packet;
-  long  bytes;
-  long  b_o_s;
-  long  e_o_s;
-
-  size64 granulepos;
-
-} ogg_packet;
-
- -
-
packet:
-
a pointer to the byte data of the raw packet
-
bytes:
-
the size of the packet' raw data
-
b_o_s:
-
beginning of stream; nonzero if this is the first packet of - the logical bitstream
-
e_o_s:
-
end of stream; nonzero if this is the last packet of the - logical bitstream
-
granulepos:
-
the absolute position of this packet in the original - uncompressed data stream.
-
- -

encoding notes

- -

The encoder is responsible for setting all of -the fields of the packet to appropriate values before submission to -ogg_stream_packetin(); however, it is noted that the value in -b_o_s is ignored; the first page produced from a given -ogg_stream_state structure will be stamped as the initial -page. e_o_s, however, must be set; this is the means by -which the stream encoding primitives handle end of stream and cleanup.

- -

decoding notes

- -

ogg_stream_packetout() sets the fields -to appropriate values. Note that granulepos will be >= 0 only in the -case that the given packet actually represents that position (ie, only -the last packet completed on any page will have a meaningful -granulepos). Intervening frames will see granulepos set -to -1.

- -

ogg_page

- -
-typedef struct {
-  unsigned char *header;
-  long header_len;
-  unsigned char *body;
-  long body_len;
-} ogg_page;
-
- -
-
header:
-
pointer to the page header data
-
header_len:
-
length of the page header in bytes
-
body:
-
pointer to the page body
-
body_len:
-
length of the page body
-
- -

Note that although the header and body pointers do -not necessarily point into a single contiguous page vector, the page -body must immediately follow the header in the bitstream.

- -

Ogg Bitstream Manipulation Functions

- -

-int ogg_page_bos(ogg_page *og); -

- -

Returns the 'beginning of stream' flag for the given Ogg page. The -beginning of stream flag is set on the initial page of a logical -bitstream.

- -

Zero indicates the flag is cleared (this is not the initial page of a -logical bitstream). Nonzero indicates the flag is set (this is the -initial page of a logical bitstream).

- -

-int ogg_page_continued(ogg_page *og); -

- -

Returns the 'packet continued' flag for the given Ogg page. The packet -continued flag indicates whether or not the body data of this page -begins with packet continued from a preceeding page.

- -

Zero (unset) indicates that the body data begins with a new packet. -Nonzero (set) indicates that the first packet data on the page is a -continuation from the preceeding page.

- -

-int ogg_page_eos(ogg_page *og); -

- -

Returns the 'end of stream' flag for a give Ogg page. The end of page -flag is set on the last (terminal) page of a logical bitstream.

- -

Zero (unset) indicates that this is not the last page of a logical -bitstream. Nonzero (set) indicates that this is the last page of a -logical bitstream and that no addiitonal pages belonging to this -bitstream may follow.

- -

-size64 ogg_page_granulepos(ogg_page *og); -

- -

Returns the position of this page as an absolute position within the -original uncompressed data. The position, as returned, is 'frames -encoded to date up to and including the last whole packet on this -page'. Partial packets begun on this page but continued to the -following page are not included. If no packet ends on this page, the -frame position value will be equal to the frame position value of the -preceeding page. If none of the original uncompressed data is yet -represented in the logical bitstream (for example, the first page of a -bitstream consists only of a header packet; this packet encodes only -metadata), the value shall be zero.

- -

The units of the framenumber are determined by media mapping. A -vorbis audio bitstream, for example, defines one frame to be the -channel values from a single sampling period (eg, a 16 bit stereo -bitstream consists of two samples of two bytes for a total of four -bytes, thus a frame would be four bytes). A video stream defines one -frame to be a single frame of video.

- -

-int ogg_page_pageno(ogg_page *og); -

- -

Returns the sequential page number of the given Ogg page. The first -page in a logical bitstream is numbered zero; following pages are -numbered in increasing monotonic order.

- -

-int ogg_page_serialno(ogg_page *og); -

- -

Returns the serial number of the given Ogg page. The serial number is -used as a handle to distinguish various logical bitstreams in a -physical Ogg bitstresm. Every logical bitstream within a -physical bitstream must use a unique (within the scope of the physical -bitstream) serial number, which is stamped on all bitstream pages.

- -

-int ogg_page_version(ogg_page *og); -

- -

Returns the revision of the Ogg bitstream structure of the given page. -Currently, the only permitted number is zero. Later revisions of the -bitstream spec will increment this version should any changes be -incompatable.

- -

-int ogg_stream_clear(ogg_stream_state *os); -

- -

Clears and deallocates the internal storage of the given Ogg stream. -After clearing, the stream structure is not initialized for use; -ogg_stream_init must be called to reinitialize for use. -Use ogg_stream_reset to reset the stream state -to a fresh, intiialized state.

- -

ogg_stream_clear does not call free() on the pointer -os, allowing use of this call on stream structures in static -or automatic storage. ogg_stream_destroyis a complimentary -function that frees the pointer as well.

- -

Returns zero on success and non-zero on failure. This function always -succeeds.

- -

-int ogg_stream_destroy(ogg_stream_state *os); -

- -

Clears and deallocates the internal storage of the given Ogg stream, -then frees the storage associated with the pointer os.

- -

ogg_stream_clear does not call free() on the pointer -os, allowing use of that call on stream structures in static -or automatic storage.

- -

Returns zero on success and non-zero on failure. This function always -succeeds.

- -

-int ogg_stream_init(ogg_stream_state *os,int serialno); -

- -

Initialize the storage associated with os for use as an Ogg -stream. This call is used to initialize a stream for both encode and -decode. The given serial number is the serial number that will be -stamped on pages of the produced bitstream (during encode), or used as -a check that pages match (during decode).

- -

Returns zero on success, nonzero on failure.

- -

-int ogg_stream_packetin(ogg_stream_state *os, ogg_packet *op); -

- -

Used during encoding to add the given raw packet to the given Ogg -bitstream. The contents of op are copied; -ogg_stream_packetin does not retain any pointers into -op's storage. The encoding proccess buffers incoming packets -until enough packets have been assembled to form an entire page; -ogg_stream_pageout is used to read complete pages.

- -

Returns zero on success, nonzero on failure.

- -

-int ogg_stream_packetout(ogg_stream_state *os,ogg_packet *op); -

- -

Used during decoding to read raw packets from the given logical -bitstream. ogg_stream_packetout will only return complete -packets for which checksumming indicates no corruption. The size and -contents of the packet exactly match those given in the encoding -process.

- -

Returns zero if the next packet is not ready to be read (not buffered -or incomplete), positive if it returned a complete packet in -op and negative if there is a gap, extra bytes or corruption -at this position in the bitstream (essentially that the bitstream had -to be recaptured). A negative value is not necessarily an error. It -would be a common occurence when seeking, for example, which requires -recapture of the bitstream at the position decoding continued.

- -

If the return value is positive, ogg_stream_packetout placed -a packet in op. The data in op points to static -storage that is valid until the next call to -ogg_stream_pagein, ogg_stream_clear, -ogg_stream_reset, or ogg_stream_destroy. The -pointers are not invalidated by more calls to -ogg_stream_packetout.

- -

-int ogg_stream_pagein(ogg_stream_state *os, ogg_page *og); -

- -

Used during decoding to buffer the given complete, pre-verified page -for decoding into raw Ogg packets. The given page must be framed, -normally produced by ogg_sync_pageout, and from the logical -bitstream associated with os (the serial numbers must match). -The contents of the given page are copied; ogg_stream_pagein -retains no pointers into og storage.

- -

Returns zero on success and non-zero on failure.

- -

-int ogg_stream_pageout(ogg_stream_state *os, ogg_page *og); -

- -

Used during encode to read complete pages from the stream buffer. The -returned page is ready for sending out to the real world.

- -

Returns zero if there is no complete page ready for reading. Returns -nonzero when it has placed data for a complete page into -og. Note that the storage returned in og points into internal -storage; the pointers in og are valid until the next call to -ogg_stream_pageout, ogg_stream_packetin, -ogg_stream_reset, ogg_stream_clear or -ogg_stream_destroy.

- -

-int ogg_stream_reset(ogg_stream_state *os); -

- -

Resets the given stream's state to that of a blank, unused stream; -this may be used during encode or decode.

- -

Note that if used during encode, it does not alter the stream's serial -number. In addition, the next page produced during encoding will be -marked as the 'initial' page of the logical bitstream.

- -

When used during decode, this simply clears the data buffer of any -pending pages. Beginning and end of stream cues are read from the -bitstream and are unaffected by reset.

- -

Returns zero on success and non-zero on failure. This function always -succeeds.

- -

-char *ogg_sync_buffer(ogg_sync_state *oy, long size); -

- -

This call is used to buffer a raw bitstream for framing and -verification. ogg_sync_buffer handles stream capture and -recapture, checksumming, and division into Ogg pages (as required by -ogg_stream_pagein).

- -

ogg_sync_buffer exposes a buffer area into which the decoder -copies the next (up to) size bytes. We expose the buffer -(rather than taking a buffer) in order to avoid an extra copy many -uses; this way, for example, read() can transfer data -directly into the stream buffer without first needing to place it in -temporary storage.

- -

Returns a pointer into oy's internal bitstream sync buffer; -the remaining space in the sync buffer is at least size -bytes. The decoder need not write all of size bytes; -ogg_sync_wrote is used to inform the engine how many bytes -were actually written. Use of ogg_sync_wrote after writing -into the exposed buffer is mandantory.

- -

-int ogg_sync_clear(ogg_sync_state *oy); -

- -

ogg_sync_clear -clears and deallocates the internal storage of the given Ogg sync -buffer. After clearing, the sync structure is not initialized for -use; ogg_sync_init must be called to reinitialize for use. -Use ogg_sync_reset to reset the sync state and buffer to a -fresh, intiialized state.

- -

ogg_sync_clear does not call free() on the pointer -oy, allowing use of this call on sync structures in static -or automatic storage. ogg_sync_destroyis a complimentary -function that frees the pointer as well.

- -

Returns zero on success and non-zero on failure. This function always -succeeds.

- -

-int ogg_sync_destroy(ogg_sync_state *oy); -

- -

Clears and deallocates the internal storage of the given Ogg sync -buffer, then frees the storage associated with the pointer -oy.

- -

ogg_sync_clear does not call free() on the pointer -oy, allowing use of that call on stream structures in static -or automatic storage.

- -

Returns zero on success and non-zero on failure. This function always -succeeds.

- -

-int ogg_sync_init(ogg_sync_state *oy); -

- -

Initializes the sync buffer oy for use.

- -

Returns zero on success and non-zero on failure. This function always -succeeds.

- -

-int ogg_sync_pageout(ogg_sync_state *oy, ogg_page *og); -

- -

Reads complete, framed, verified Ogg pages from the sync buffer, -placing the page data in og.

- -

Returns zero when there's no complete pages buffered for -retrieval. Returns negative when a loss of sync or recapture occurred -(this is not necessarily an error; recapture would be required after -seeking, for example). Returns positive when a page is returned in -og. Note that the data in og points into the sync -buffer storage; the pointers are valid until the next call to -ogg_sync_buffer, ogg_sync_clear, -ogg_sync_destroy or ogg_sync_reset.

- -

-int ogg_sync_reset(ogg_sync_state *oy); -

- -

ogg_sync_reset resets the sync state in oy to a -clean, empty state. This is useful, for example, when seeking to a -new location in a bitstream.

- -

Returns zero on success, nonzero on failure.

- -

-int ogg_sync_wrote(ogg_sync_state *oy, long bytes); -

- -

Used to inform the sync state as to how many bytes were actually -written into the exposed sync buffer. It must be equal to or less -than the size of the buffer requested.

- -

Returns zero on success and non-zero on failure; failure occurs only -when the number of bytes written were larger than the buffer.

- - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Programming with Xiph.org libvorbis

+ +

Description

+ +

Libvorbis is the Xiph.org Foundation's portable Ogg Vorbis CODEC +implemented as a programmatic library. Libvorbis provides primitives +to handle framing and manipulation of Ogg bitstreams (used by +Vorbis for streaming), a full analysis (encoding) interface as well as +packet decoding and synthesis for playback.

+ +

The libvorbis library does not provide any system interface; a +full-featured demonstration player included with the library +distribution provides example code for a variety of system interfaces +as well as a working example of using libvorbis in production code.

+ +

Encoding Overview

+ +

Decoding Overview

+ +

Decoding a bitstream with libvorbis follows roughly the following +steps:

+ +
    +
  1. Frame the incoming bitstream into pages
  2. +
  3. Sort the pages by logical bitstream and buffer them into logical streams
  4. +
  5. Decompose the logical streams into raw packets
  6. +
  7. Reconstruct segments of the original data from each packet
  8. +
  9. Glue the reconstructed segments back into a decoded stream
  10. +
+ +

Framing

+ +

An Ogg bitstream is logically arranged into pages, but to decode +the pages, we have to find them first. The raw bitstream is first fed +into an ogg_sync_state buffer using ogg_sync_buffer() +and ogg_sync_wrote(). After each block we submit to the sync +buffer, we should check to see if we can frame and extract a complete +page or pages using ogg_sync_pageout(). Extra pages are +buffered; allowing them to build up in the ogg_sync_state +buffer will eventually exhaust memory.

+ +

The Ogg pages returned from ogg_sync_pageout need not be +decoded further to be used as landmarks in seeking; seeking can be +either a rough process of simply jumping to approximately intuited +portions of the bitstream, or it can be a precise bisection process +that captures pages and inspects data position. When seeking, +however, sequential multiplexing (chaining) must be accounted for; +beginning play in a new logical bitstream requires initializing a +synthesis engine with the headers from that bitstream. Vorbis +bitstreams do not make use of concurrent multiplexing (grouping).

+ +

Sorting

+ +

The pages produced by ogg_sync_pageout are then sorted by +serial number to separate logical bitstreams. Initialize logical +bitstream buffers (ogg_stream_state) using +ogg_stream_init(). Pages are submitted to the matching +logical bitstream buffer using ogg_stream_pagein; the serial +number of the page and the stream buffer must match, or the page will +be rejected. A page submitted out of sequence will simply be noted, +and in the course of outputting packets, the hole will be flagged +(ogg_sync_pageout and ogg_stream_packetout will +return a negative value at positions where they had to recapture the +stream).

+ +

Extracting packets

+ +

After submitting page[s] to a logical stream, read available packets +using ogg_stream_packetout.

+ +

Decoding packets

+ +

Reassembling data segments

+ +

Ogg Bitstream Manipulation Structures

+ +

Two of the Ogg bitstream data structures are intended to be +transparent to the developer; the fields should be used directly.

+ +

ogg_packet

+ +
+typedef struct {
+  unsigned char *packet;
+  long  bytes;
+  long  b_o_s;
+  long  e_o_s;
+
+  size64 granulepos;
+
+} ogg_packet;
+
+ +
+
packet:
+
a pointer to the byte data of the raw packet
+
bytes:
+
the size of the packet's raw data
+
b_o_s:
+
beginning of stream; nonzero if this is the first packet of + the logical bitstream
+
e_o_s:
+
end of stream; nonzero if this is the last packet of the + logical bitstream
+
granulepos:
+
the absolute position of this packet in the original + uncompressed data stream.
+
+ +

encoding notes

+ +

The encoder is responsible for setting all of +the fields of the packet to appropriate values before submission to +ogg_stream_packetin(); however, it is noted that the value in +b_o_s is ignored; the first page produced from a given +ogg_stream_state structure will be stamped as the initial +page. e_o_s, however, must be set; this is the means by +which the stream encoding primitives handle end of stream and cleanup.

+ +

decoding notes

+ +

ogg_stream_packetout() sets the fields +to appropriate values. Note that granulepos will be >= 0 only in the +case that the given packet actually represents that position (ie, only +the last packet completed on any page will have a meaningful +granulepos). Intervening frames will see granulepos set +to -1.

+ +

ogg_page

+ +
+typedef struct {
+  unsigned char *header;
+  long header_len;
+  unsigned char *body;
+  long body_len;
+} ogg_page;
+
+ +
+
header:
+
pointer to the page header data
+
header_len:
+
length of the page header in bytes
+
body:
+
pointer to the page body
+
body_len:
+
length of the page body
+
+ +

Note that although the header and body pointers do +not necessarily point into a single contiguous page vector, the page +body must immediately follow the header in the bitstream.

+ +

Ogg Bitstream Manipulation Functions

+ +

+int ogg_page_bos(ogg_page *og); +

+ +

Returns the 'beginning of stream' flag for the given Ogg page. The +beginning of stream flag is set on the initial page of a logical +bitstream.

+ +

Zero indicates the flag is cleared (this is not the initial page of a +logical bitstream). Nonzero indicates the flag is set (this is the +initial page of a logical bitstream).

+ +

+int ogg_page_continued(ogg_page *og); +

+ +

Returns the 'packet continued' flag for the given Ogg page. The packet +continued flag indicates whether or not the body data of this page +begins with packet continued from a preceding page.

+ +

Zero (unset) indicates that the body data begins with a new packet. +Nonzero (set) indicates that the first packet data on the page is a +continuation from the preceding page.

+ +

+int ogg_page_eos(ogg_page *og); +

+ +

Returns the 'end of stream' flag for a given Ogg page. The end of stream +flag is set on the last (terminal) page of a logical bitstream.

+ +

Zero (unset) indicates that this is not the last page of a logical +bitstream. Nonzero (set) indicates that this is the last page of a +logical bitstream and that no additional pages belonging to this +bitstream may follow.

+ +

+size64 ogg_page_granulepos(ogg_page *og); +

+ +

Returns the position of this page as an absolute position within the +original uncompressed data. The position, as returned, is 'frames +encoded to date up to and including the last whole packet on this +page'. Partial packets begun on this page but continued to the +following page are not included. If no packet ends on this page, the +frame position value will be equal to the frame position value of the +preceding page. If none of the original uncompressed data is yet +represented in the logical bitstream (for example, the first page of a +bitstream consists only of a header packet; this packet encodes only +metadata), the value shall be zero.

+ +

The units of the framenumber are determined by media mapping. A +vorbis audio bitstream, for example, defines one frame to be the +channel values from a single sampling period (eg, a 16 bit stereo +bitstream consists of two samples of two bytes for a total of four +bytes, thus a frame would be four bytes). A video stream defines one +frame to be a single frame of video.

+ +

+int ogg_page_pageno(ogg_page *og); +

+ +

Returns the sequential page number of the given Ogg page. The first +page in a logical bitstream is numbered zero; following pages are +numbered in increasing monotonic order.

+ +

+int ogg_page_serialno(ogg_page *og); +

+ +

Returns the serial number of the given Ogg page. The serial number is +used as a handle to distinguish various logical bitstreams in a +physical Ogg bitstream. Every logical bitstream within a +physical bitstream must use a unique (within the scope of the physical +bitstream) serial number, which is stamped on all bitstream pages.

+ +

+int ogg_page_version(ogg_page *og); +

+ +

Returns the revision of the Ogg bitstream structure of the given page. +Currently, the only permitted number is zero. Later revisions of the +bitstream spec will increment this version should any changes be +incompatible.

+ +

+int ogg_stream_clear(ogg_stream_state *os); +

+ +

Clears and deallocates the internal storage of the given Ogg stream. +After clearing, the stream structure is not initialized for use; +ogg_stream_init must be called to reinitialize for use. +Use ogg_stream_reset to reset the stream state +to a fresh, initialized state.

+ +

ogg_stream_clear does not call free() on the pointer +os, allowing use of this call on stream structures in static +or automatic storage. ogg_stream_destroy is a complementary +function that frees the pointer as well.

+ +

Returns zero on success and non-zero on failure. This function always +succeeds.

+ +

+int ogg_stream_destroy(ogg_stream_state *os); +

+ +

Clears and deallocates the internal storage of the given Ogg stream, +then frees the storage associated with the pointer os.

+ +

ogg_stream_clear does not call free() on the pointer +os, allowing use of that call on stream structures in static +or automatic storage.

+ +

Returns zero on success and non-zero on failure. This function always +succeeds.

+ +

+int ogg_stream_init(ogg_stream_state *os,int serialno); +

+ +

Initialize the storage associated with os for use as an Ogg +stream. This call is used to initialize a stream for both encode and +decode. The given serial number is the serial number that will be +stamped on pages of the produced bitstream (during encode), or used as +a check that pages match (during decode).

+ +

Returns zero on success, nonzero on failure.

+ +

+int ogg_stream_packetin(ogg_stream_state *os, ogg_packet *op); +

+ +

Used during encoding to add the given raw packet to the given Ogg +bitstream. The contents of op are copied; +ogg_stream_packetin does not retain any pointers into +op's storage. The encoding process buffers incoming packets +until enough packets have been assembled to form an entire page; +ogg_stream_pageout is used to read complete pages.

+ +

Returns zero on success, nonzero on failure.

+ +

+int ogg_stream_packetout(ogg_stream_state *os,ogg_packet *op); +

+ +

Used during decoding to read raw packets from the given logical +bitstream. ogg_stream_packetout will only return complete +packets for which checksumming indicates no corruption. The size and +contents of the packet exactly match those given in the encoding +process.

+ +

Returns zero if the next packet is not ready to be read (not buffered +or incomplete), positive if it returned a complete packet in +op and negative if there is a gap, extra bytes or corruption +at this position in the bitstream (essentially that the bitstream had +to be recaptured). A negative value is not necessarily an error. It +would be a common occurrence when seeking, for example, which requires +recapture of the bitstream at the position decoding continued.

+ +

If the return value is positive, ogg_stream_packetout placed +a packet in op. The data in op points to static +storage that is valid until the next call to +ogg_stream_pagein, ogg_stream_clear, +ogg_stream_reset, or ogg_stream_destroy. The +pointers are not invalidated by more calls to +ogg_stream_packetout.

+ +

+int ogg_stream_pagein(ogg_stream_state *os, ogg_page *og); +

+ +

Used during decoding to buffer the given complete, pre-verified page +for decoding into raw Ogg packets. The given page must be framed, +normally produced by ogg_sync_pageout, and from the logical +bitstream associated with os (the serial numbers must match). +The contents of the given page are copied; ogg_stream_pagein +retains no pointers into og storage.

+ +

Returns zero on success and non-zero on failure.

+ +

+int ogg_stream_pageout(ogg_stream_state *os, ogg_page *og); +

+ +

Used during encode to read complete pages from the stream buffer. The +returned page is ready for sending out to the real world.

+ +

Returns zero if there is no complete page ready for reading. Returns +nonzero when it has placed data for a complete page into +og. Note that the storage returned in og points into internal +storage; the pointers in og are valid until the next call to +ogg_stream_pageout, ogg_stream_packetin, +ogg_stream_reset, ogg_stream_clear or +ogg_stream_destroy.

+ +

+int ogg_stream_reset(ogg_stream_state *os); +

+ +

Resets the given stream's state to that of a blank, unused stream; +this may be used during encode or decode.

+ +

Note that if used during encode, it does not alter the stream's serial +number. In addition, the next page produced during encoding will be +marked as the 'initial' page of the logical bitstream.

+ +

When used during decode, this simply clears the data buffer of any +pending pages. Beginning and end of stream cues are read from the +bitstream and are unaffected by reset.

+ +

Returns zero on success and non-zero on failure. This function always +succeeds.

+ +

+char *ogg_sync_buffer(ogg_sync_state *oy, long size); +

+ +

This call is used to buffer a raw bitstream for framing and +verification. ogg_sync_buffer handles stream capture and +recapture, checksumming, and division into Ogg pages (as required by +ogg_stream_pagein).

+ +

ogg_sync_buffer exposes a buffer area into which the decoder +copies the next (up to) size bytes. We expose the buffer +(rather than taking a buffer) in order to avoid an extra copy in many +uses; this way, for example, read() can transfer data +directly into the stream buffer without first needing to place it in +temporary storage.

+ +

Returns a pointer into oy's internal bitstream sync buffer; +the remaining space in the sync buffer is at least size +bytes. The decoder need not write all of size bytes; +ogg_sync_wrote is used to inform the engine how many bytes +were actually written. Use of ogg_sync_wrote after writing +into the exposed buffer is mandatory.

+ +

+int ogg_sync_clear(ogg_sync_state *oy); +

+ +

ogg_sync_clear +clears and deallocates the internal storage of the given Ogg sync +buffer. After clearing, the sync structure is not initialized for +use; ogg_sync_init must be called to reinitialize for use. +Use ogg_sync_reset to reset the sync state and buffer to a +fresh, initialized state.

+ +

ogg_sync_clear does not call free() on the pointer +oy, allowing use of this call on sync structures in static +or automatic storage. ogg_sync_destroy is a complementary +function that frees the pointer as well.

+ +

Returns zero on success and non-zero on failure. This function always +succeeds.

+ +

+int ogg_sync_destroy(ogg_sync_state *oy); +

+ +

Clears and deallocates the internal storage of the given Ogg sync +buffer, then frees the storage associated with the pointer +oy.

+ +

ogg_sync_clear does not call free() on the pointer +oy, allowing use of that call on sync structures in static +or automatic storage.

+ +

Returns zero on success and non-zero on failure. This function always +succeeds.

+ +

+int ogg_sync_init(ogg_sync_state *oy); +

+ +

Initializes the sync buffer oy for use.

+ +

Returns zero on success and non-zero on failure. This function always +succeeds.

+ +

+int ogg_sync_pageout(ogg_sync_state *oy, ogg_page *og); +

+ +

Reads complete, framed, verified Ogg pages from the sync buffer, +placing the page data in og.

+ +

Returns zero when there are no complete pages buffered for +retrieval. Returns negative when a loss of sync or recapture occurred +(this is not necessarily an error; recapture would be required after +seeking, for example). Returns positive when a page is returned in +og. Note that the data in og points into the sync +buffer storage; the pointers are valid until the next call to +ogg_sync_buffer, ogg_sync_clear, +ogg_sync_destroy or ogg_sync_reset.

+ +

+int ogg_sync_reset(ogg_sync_state *oy); +

+ +

ogg_sync_reset resets the sync state in oy to a +clean, empty state. This is useful, for example, when seeking to a +new location in a bitstream.

+ +

Returns zero on success, nonzero on failure.

+ +

+int ogg_sync_wrote(ogg_sync_state *oy, long bytes); +

+ +

Used to inform the sync state as to how many bytes were actually +written into the exposed sync buffer. It must be equal to or less +than the size of the buffer requested.

+ +

Returns zero on success and non-zero on failure; failure occurs only +when the number of bytes written was larger than the buffer.

+ + + + + diff --git a/doc/stereo.html b/doc/stereo.html index 5e525cf..0412202 100644 --- a/doc/stereo.html +++ b/doc/stereo.html @@ -1,418 +1,418 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Ogg Vorbis stereo-specific channel coupling discussion

- -

Abstract

- -

The Vorbis audio CODEC provides a channel coupling -mechanisms designed to reduce effective bitrate by both eliminating -interchannel redundancy and eliminating stereo image information -labeled inaudible or undesirable according to spatial psychoacoustic -models. This document describes both the mechanical coupling -mechanisms available within the Vorbis specification, as well as the -specific stereo coupling models used by the reference -libvorbis codec provided by xiph.org.

- -

Mechanisms

- -

In encoder release beta 4 and earlier, Vorbis supported multiple -channel encoding, but the channels were encoded entirely separately -with no cross-analysis or redundancy elimination between channels. -This multichannel strategy is very similar to the mp3's dual -stereo mode and Vorbis uses the same name for its analogous -uncoupled multichannel modes.

- -

However, the Vorbis spec provides for, and Vorbis release 1.0 rc1 and -later implement a coupled channel strategy. Vorbis has two specific -mechanisms that may be used alone or in conjunction to implement -channel coupling. The first is channel interleaving via -residue backend type 2, and the second is square polar -mapping. These two general mechanisms are particularly well -suited to coupling due to the structure of Vorbis encoding, as we'll -explore below, and using both we can implement both totally -lossless stereo image coupling [bit-for-bit decode-identical -to uncoupled modes], as well as various lossy models that seek to -eliminate inaudible or unimportant aspects of the stereo image in -order to enhance bitrate. The exact coupling implementation is -generalized to allow the encoder a great deal of flexibility in -implementation of a stereo or surround model without requiring any -significant complexity increase over the combinatorially simpler -mid/side joint stereo of mp3 and other current audio codecs.

- -

A particular Vorbis bitstream may apply channel coupling directly to -more than a pair of channels; polar mapping is hierarchical such that -polar coupling may be extrapolated to an arbitrary number of channels -and is not restricted to only stereo, quadraphonics, ambisonics or 5.1 -surround. However, the scope of this document restricts itself to the -stereo coupling case.

- -

Square Polar Mapping

- -

maximal correlation

- -

Recall that the basic structure of a a Vorbis I stream first generates -from input audio a spectral 'floor' function that serves as an -MDCT-domain whitening filter. This floor is meant to represent the -rough envelope of the frequency spectrum, using whatever metric the -encoder cares to define. This floor is subtracted from the log -frequency spectrum, effectively normalizing the spectrum by frequency. -Each input channel is associated with a unique floor function.

- -

The basic idea behind any stereo coupling is that the left and right -channels usually correlate. This correlation is even stronger if one -first accounts for energy differences in any given frequency band -across left and right; think for example of individual instruments -mixed into different portions of the stereo image, or a stereo -recording with a dominant feature not perfectly in the center. The -floor functions, each specific to a channel, provide the perfect means -of normalizing left and right energies across the spectrum to maximize -correlation before coupling. This feature of the Vorbis format is not -a convenient accident.

- -

Because we strive to maximally correlate the left and right channels -and generally succeed in doing so, left and right residue is typically -nearly identical. We could use channel interleaving (discussed below) -alone to efficiently remove the redundancy between the left and right -channels as a side effect of entropy encoding, but a polar -representation gives benefits when left/right correlation is -strong.

- -

point and diffuse imaging

- -

The first advantage of a polar representation is that it effectively -separates the spatial audio information into a 'point image' -(magnitude) at a given frequency and located somewhere in the sound -field, and a 'diffuse image' (angle) that fills a large amount of -space simultaneously. Even if we preserve only the magnitude (point) -data, a detailed and carefully chosen floor function in each channel -provides us with a free, fine-grained, frequency relative intensity -stereo*. Angle information represents diffuse sound fields, such as -reverberation that fills the entire space simultaneously.

- -

*Because the Vorbis model supports a number of different possible -stereo models and these models may be mixed, we do not use the term -'intensity stereo' talking about Vorbis; instead we use the terms -'point stereo', 'phase stereo' and subcategories of each.

- -

The majority of a stereo image is representable by polar magnitude -alone, as strong sounds tend to be produced at near-point sources; -even non-diffuse, fast, sharp echoes track very accurately using -magnitude representation almost alone (for those experimenting with -Vorbis tuning, this strategy works much better with the precise, -piecewise control of floor 1; the continuous approximation of floor 0 -results in unstable imaging). Reverberation and diffuse sounds tend -to contain less energy and be psychoacoustically dominated by the -point sources embedded in them. Thus, we again tend to concentrate -more represented energy into a predictably smaller number of numbers. -Separating representation of point and diffuse imaging also allows us -to model and manipulate point and diffuse qualities separately.

- -

controlling bit leakage and symbol crosstalk

- -

Because polar -representation concentrates represented energy into fewer large -values, we reduce bit 'leakage' during cascading (multistage VQ -encoding) as a secondary benefit. A single large, monolithic VQ -codebook is more efficient than a cascaded book due to entropy -'crosstalk' among symbols between different stages of a multistage cascade. -Polar representation is a way of further concentrating entropy into -predictable locations so that codebook design can take steps to -improve multistage codebook efficiency. It also allows us to cascade -various elements of the stereo image independently.

- -

eliminating trigonometry and rounding

- -

Rounding and computational complexity are potential problems with a -polar representation. As our encoding process involves quantization, -mixing a polar representation and quantization makes it potentially -impossible, depending on implementation, to construct a coupled stereo -mechanism that results in bit-identical decompressed output compared -to an uncoupled encoding should the encoder desire it.

- -

Vorbis uses a mapping that preserves the most useful qualities of -polar representation, relies only on addition/subtraction (during -decode; high quality encoding still requires some trig), and makes it -trivial before or after quantization to represent an angle/magnitude -through a one-to-one mapping from possible left/right value -permutations. We do this by basing our polar representation on the -unit square rather than the unit-circle.

- -

Given a magnitude and angle, we recover left and right using the -following function (note that A/B may be left/right or right/left -depending on the coupling definition used by the encoder):

- -
-      if(magnitude>0)
-        if(angle>0){
-          A=magnitude;
-          B=magnitude-angle;
-        }else{
-          B=magnitude;
-          A=magnitude+angle;
-        }
-      else
-        if(angle>0){
-          A=magnitude;
-          B=magnitude+angle;
-        }else{
-          B=magnitude;
-          A=magnitude-angle;
-        }
-    }
-
- -

The function is antisymmetric for positive and negative magnitudes in -order to eliminate a redundant value when quantizing. For example, if -we're quantizing to integer values, we can visualize a magnitude of 5 -and an angle of -2 as follows:

- -

square polar

- -

This representation loses or replicates no values; if the range of A -and B are integral -5 through 5, the number of possible Cartesian -permutations is 121. Represented in square polar notation, the -possible values are:

- -
- 0, 0
-
--1,-2  -1,-1  -1, 0  -1, 1
-
- 1,-2   1,-1   1, 0   1, 1
-
--2,-4  -2,-3  -2,-2  -2,-1  -2, 0  -2, 1  -2, 2  -2, 3  
-
- 2,-4   2,-3   ... following the pattern ...
-
- ...   5, 1   5, 2   5, 3   5, 4   5, 5   5, 6   5, 7   5, 8   5, 9
-
-
- -

...for a grand total of 121 possible values, the same number as in -Cartesian representation (note that, for example, 5,-10 is -the same as -5,10, so there's no reason to represent -both. 2,10 cannot happen, and there's no reason to account for it.) -It's also obvious that this mapping is exactly reversible.

- -

Channel interleaving

- -

We can remap and A/B vector using polar mapping into a magnitude/angle -vector, and it's clear that, in general, this concentrates energy in -the magnitude vector and reduces the amount of information to encode -in the angle vector. Encoding these vectors independently with -residue backend #0 or residue backend #1 will result in bitrate -savings. However, there are still implicit correlations between the -magnitude and angle vectors. The most obvious is that the amplitude -of the angle is bounded by its corresponding magnitude value.

- -

Entropy coding the results, then, further benefits from the entropy -model being able to compress magnitude and angle simultaneously. For -this reason, Vorbis implements residue backend #2 which pre-interleaves -a number of input vectors (in the stereo case, two, A and B) into a -single output vector (with the elements in the order of -A_0, B_0, A_1, B_1, A_2 ... A_n-1, B_n-1) before entropy encoding. Thus -each vector to be coded by the vector quantization backend consists of -matching magnitude and angle values.

- -

The astute reader, at this point, will notice that in the theoretical -case in which we can use monolithic codebooks of arbitrarily large -size, we can directly interleave and encode left and right without -polar mapping; in fact, the polar mapping does not appear to lend any -benefit whatsoever to the efficiency of the entropy coding. In fact, -it is perfectly possible and reasonable to build a Vorbis encoder that -dispenses with polar mapping entirely and merely interleaves the -channel. Libvorbis based encoders may configure such an encoding and -it will work as intended.

- -

However, when we leave the ideal/theoretical domain, we notice that -polar mapping does give additional practical benefits, as discussed in -the above section on polar mapping and summarized again here:

- - - -

Stereo Models

- -

Dual Stereo

- -

Dual stereo refers to stereo encoding where the channels are entirely -separate; they are analyzed and encoded as entirely distinct entities. -This terminology is familiar from mp3.

- -

Lossless Stereo

- -

Using polar mapping and/or channel interleaving, it's possible to -couple Vorbis channels losslessly, that is, construct a stereo -coupling encoding that both saves space but also decodes -bit-identically to dual stereo. OggEnc 1.0 and later uses this -mode in all high-bitrate encoding.

- -

Overall, this stereo mode is overkill; however, it offers a safe -alternative to users concerned about the slightest possible -degradation to the stereo image or archival quality audio.

- -

Phase Stereo

- -

Phase stereo is the least aggressive means of gracefully dropping -resolution from the stereo image; it affects only diffuse imaging.

- -

It's often quoted that the human ear is deaf to signal phase above -about 4kHz; this is nearly true and a passable rule of thumb, but it -can be demonstrated that even an average user can tell the difference -between high frequency in-phase and out-of-phase noise. Obviously -then, the statement is not entirely true. However, it's also the case -that one must resort to nearly such an extreme demonstration before -finding the counterexample.

- -

'Phase stereo' is simply a more aggressive quantization of the polar -angle vector; above 4kHz it's generally quite safe to quantize noise -and noisy elements to only a handful of allowed phases, or to thin the -phase with respect to the magnitude. The phases of high amplitude -pure tones may or may not be preserved more carefully (they are -relatively rare and L/R tend to be in phase, so there is generally -little reason not to spend a few more bits on them)

- -

example: eight phase stereo

- -

Vorbis may implement phase stereo coupling by preserving the entirety -of the magnitude vector (essential to fine amplitude and energy -resolution overall) and quantizing the angle vector to one of only -four possible values. Given that the magnitude vector may be positive -or negative, this results in left and right phase having eight -possible permutation, thus 'eight phase stereo':

- -

eight phase

- -

Left and right may be in phase (positive or negative), the most common -case by far, or out of phase by 90 or 180 degrees.

- -

example: four phase stereo

- -

Similarly, four phase stereo takes the quantization one step further; -it allows only in-phase and 180 degree out-out-phase signals:

- -

four phase

- -

example: point stereo

- -

Point stereo eliminates the possibility of out-of-phase signal -entirely. Any diffuse quality to a sound source tends to collapse -inward to a point somewhere within the stereo image. A practical -example would be balanced reverberations within a large, live space; -normally the sound is diffuse and soft, giving a sonic impression of -volume. In point-stereo, the reverberations would still exist, but -sound fairly firmly centered within the image (assuming the -reverberation was centered overall; if the reverberation is stronger -to the left, then the point of localization in point stereo would be -to the left). This effect is most noticeable at low and mid -frequencies and using headphones (which grant perfect stereo -separation). Point stereo is is a graceful but generally easy to -detect degradation to the sound quality and is thus used in frequency -ranges where it is least noticeable.

- -

Mixed Stereo

- -

Mixed stereo is the simultaneous use of more than one of the above -stereo encoding models, generally using more aggressive modes in -higher frequencies, lower amplitudes or 'nearly' in-phase sound.

- -

It is also the case that near-DC frequencies should be encoded using -lossless coupling to avoid frame blocking artifacts.

- -

Vorbis Stereo Modes

- -

Vorbis, as of 1.0, uses lossless stereo and a number of mixed modes -constructed out of lossless and point stereo. Phase stereo was used -in the rc2 encoder, but is not currently used for simplicity's sake. It -will likely be re-added to the stereo model in the future.

- - - - - - - - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Ogg Vorbis stereo-specific channel coupling discussion

+ +

Abstract

+ +

The Vorbis audio CODEC provides channel coupling +mechanisms designed to reduce effective bitrate by both eliminating +interchannel redundancy and eliminating stereo image information +labeled inaudible or undesirable according to spatial psychoacoustic +models. This document describes both the mechanical coupling +mechanisms available within the Vorbis specification, as well as the +specific stereo coupling models used by the reference +libvorbis codec provided by xiph.org.

+ +

Mechanisms

+ +

In encoder release beta 4 and earlier, Vorbis supported multiple +channel encoding, but the channels were encoded entirely separately +with no cross-analysis or redundancy elimination between channels. +This multichannel strategy is very similar to the mp3's dual +stereo mode and Vorbis uses the same name for its analogous +uncoupled multichannel modes.

+ +

However, the Vorbis spec provides for, and Vorbis release 1.0 rc1 and +later implement, a coupled channel strategy. Vorbis has two specific +mechanisms that may be used alone or in conjunction to implement +channel coupling. The first is channel interleaving via +residue backend type 2, and the second is square polar +mapping. These two general mechanisms are particularly well +suited to coupling due to the structure of Vorbis encoding, as we'll +explore below, and using both we can implement both totally +lossless stereo image coupling [bit-for-bit decode-identical +to uncoupled modes], as well as various lossy models that seek to +eliminate inaudible or unimportant aspects of the stereo image in +order to enhance bitrate. The exact coupling implementation is +generalized to allow the encoder a great deal of flexibility in +implementation of a stereo or surround model without requiring any +significant complexity increase over the combinatorially simpler +mid/side joint stereo of mp3 and other current audio codecs.

+ +

A particular Vorbis bitstream may apply channel coupling directly to +more than a pair of channels; polar mapping is hierarchical such that +polar coupling may be extrapolated to an arbitrary number of channels +and is not restricted to only stereo, quadraphonics, ambisonics or 5.1 +surround. However, the scope of this document restricts itself to the +stereo coupling case.

+ +

Square Polar Mapping

+ +

maximal correlation

+ +

Recall that the basic structure of a Vorbis I stream first generates +from input audio a spectral 'floor' function that serves as an +MDCT-domain whitening filter. This floor is meant to represent the +rough envelope of the frequency spectrum, using whatever metric the +encoder cares to define. This floor is subtracted from the log +frequency spectrum, effectively normalizing the spectrum by frequency. +Each input channel is associated with a unique floor function.

+ +

The basic idea behind any stereo coupling is that the left and right +channels usually correlate. This correlation is even stronger if one +first accounts for energy differences in any given frequency band +across left and right; think for example of individual instruments +mixed into different portions of the stereo image, or a stereo +recording with a dominant feature not perfectly in the center. The +floor functions, each specific to a channel, provide the perfect means +of normalizing left and right energies across the spectrum to maximize +correlation before coupling. This feature of the Vorbis format is not +a convenient accident.

+ +

Because we strive to maximally correlate the left and right channels +and generally succeed in doing so, left and right residue is typically +nearly identical. We could use channel interleaving (discussed below) +alone to efficiently remove the redundancy between the left and right +channels as a side effect of entropy encoding, but a polar +representation gives benefits when left/right correlation is +strong.

+ +

point and diffuse imaging

+ +

The first advantage of a polar representation is that it effectively +separates the spatial audio information into a 'point image' +(magnitude) at a given frequency and located somewhere in the sound +field, and a 'diffuse image' (angle) that fills a large amount of +space simultaneously. Even if we preserve only the magnitude (point) +data, a detailed and carefully chosen floor function in each channel +provides us with a free, fine-grained, frequency relative intensity +stereo*. Angle information represents diffuse sound fields, such as +reverberation that fills the entire space simultaneously.

+ +

*Because the Vorbis model supports a number of different possible +stereo models and these models may be mixed, we do not use the term +'intensity stereo' talking about Vorbis; instead we use the terms +'point stereo', 'phase stereo' and subcategories of each.

+ +

The majority of a stereo image is representable by polar magnitude +alone, as strong sounds tend to be produced at near-point sources; +even non-diffuse, fast, sharp echoes track very accurately using +magnitude representation almost alone (for those experimenting with +Vorbis tuning, this strategy works much better with the precise, +piecewise control of floor 1; the continuous approximation of floor 0 +results in unstable imaging). Reverberation and diffuse sounds tend +to contain less energy and be psychoacoustically dominated by the +point sources embedded in them. Thus, we again tend to concentrate +more represented energy into a predictably smaller number of numbers. +Separating representation of point and diffuse imaging also allows us +to model and manipulate point and diffuse qualities separately.

+ +

controlling bit leakage and symbol crosstalk

+ +

Because polar +representation concentrates represented energy into fewer large +values, we reduce bit 'leakage' during cascading (multistage VQ +encoding) as a secondary benefit. A single large, monolithic VQ +codebook is more efficient than a cascaded book due to entropy +'crosstalk' among symbols between different stages of a multistage cascade. +Polar representation is a way of further concentrating entropy into +predictable locations so that codebook design can take steps to +improve multistage codebook efficiency. It also allows us to cascade +various elements of the stereo image independently.

+ +

eliminating trigonometry and rounding

+ +

Rounding and computational complexity are potential problems with a +polar representation. As our encoding process involves quantization, +mixing a polar representation and quantization makes it potentially +impossible, depending on implementation, to construct a coupled stereo +mechanism that results in bit-identical decompressed output compared +to an uncoupled encoding should the encoder desire it.

+ +

Vorbis uses a mapping that preserves the most useful qualities of +polar representation, relies only on addition/subtraction (during +decode; high quality encoding still requires some trig), and makes it +trivial before or after quantization to represent an angle/magnitude +through a one-to-one mapping from possible left/right value +permutations. We do this by basing our polar representation on the +unit square rather than the unit-circle.

+ +

Given a magnitude and angle, we recover left and right using the +following function (note that A/B may be left/right or right/left +depending on the coupling definition used by the encoder):

+ +
+      if(magnitude>0)
+        if(angle>0){
+          A=magnitude;
+          B=magnitude-angle;
+        }else{
+          B=magnitude;
+          A=magnitude+angle;
+        }
+      else
+        if(angle>0){
+          A=magnitude;
+          B=magnitude+angle;
+        }else{
+          B=magnitude;
+          A=magnitude-angle;
+        }
+
+
+ +

The function is antisymmetric for positive and negative magnitudes in +order to eliminate a redundant value when quantizing. For example, if +we're quantizing to integer values, we can visualize a magnitude of 5 +and an angle of -2 as follows:

+ +

square polar

+ +

This representation loses or replicates no values; if A +and B each range over the integers -5 through 5, the number of possible Cartesian +permutations is 121. Represented in square polar notation, the +possible values are:

+ +
+ 0, 0
+
+-1,-2  -1,-1  -1, 0  -1, 1
+
+ 1,-2   1,-1   1, 0   1, 1
+
+-2,-4  -2,-3  -2,-2  -2,-1  -2, 0  -2, 1  -2, 2  -2, 3  
+
+ 2,-4   2,-3   ... following the pattern ...
+
+ ...   5, 1   5, 2   5, 3   5, 4   5, 5   5, 6   5, 7   5, 8   5, 9
+
+
+ +

...for a grand total of 121 possible values, the same number as in +Cartesian representation (note that, for example, 5,-10 is +the same as -5,10, so there's no reason to represent +both. 2,10 cannot happen, and there's no reason to account for it.) +It's also obvious that this mapping is exactly reversible.

+ +

Channel interleaving

+ +

We can remap an A/B vector using polar mapping into a magnitude/angle +vector, and it's clear that, in general, this concentrates energy in +the magnitude vector and reduces the amount of information to encode +in the angle vector. Encoding these vectors independently with +residue backend #0 or residue backend #1 will result in bitrate +savings. However, there are still implicit correlations between the +magnitude and angle vectors. The most obvious is that the amplitude +of the angle is bounded by its corresponding magnitude value.

+ +

Entropy coding the results, then, further benefits from the entropy +model being able to compress magnitude and angle simultaneously. For +this reason, Vorbis implements residue backend #2 which pre-interleaves +a number of input vectors (in the stereo case, two, A and B) into a +single output vector (with the elements in the order of +A_0, B_0, A_1, B_1, A_2 ... A_n-1, B_n-1) before entropy encoding. Thus +each vector to be coded by the vector quantization backend consists of +matching magnitude and angle values.

+ +

The astute reader, at this point, will notice that in the theoretical +case in which we can use monolithic codebooks of arbitrarily large +size, we can directly interleave and encode left and right without +polar mapping; in fact, the polar mapping does not appear to lend any +benefit whatsoever to the efficiency of the entropy coding. In fact, +it is perfectly possible and reasonable to build a Vorbis encoder that +dispenses with polar mapping entirely and merely interleaves the +channels. Libvorbis-based encoders may configure such an encoding and +it will work as intended.

+ +

However, when we leave the ideal/theoretical domain, we notice that +polar mapping does give additional practical benefits, as discussed in +the above section on polar mapping and summarized again here:

+ + + +

Stereo Models

+ +

Dual Stereo

+ +

Dual stereo refers to stereo encoding where the channels are entirely +separate; they are analyzed and encoded as entirely distinct entities. +This terminology is familiar from mp3.

+ +

Lossless Stereo

+ +

Using polar mapping and/or channel interleaving, it's possible to +couple Vorbis channels losslessly, that is, construct a stereo +coupling encoding that both saves space and decodes +bit-identically to dual stereo. OggEnc 1.0 and later uses this +mode in all high-bitrate encoding.

+ +

Overall, this stereo mode is overkill; however, it offers a safe +alternative to users concerned about the slightest possible +degradation to the stereo image or archival quality audio.

+ +

Phase Stereo

+ +

Phase stereo is the least aggressive means of gracefully dropping +resolution from the stereo image; it affects only diffuse imaging.

+ +

It's often quoted that the human ear is deaf to signal phase above +about 4kHz; this is nearly true and a passable rule of thumb, but it +can be demonstrated that even an average user can tell the difference +between high frequency in-phase and out-of-phase noise. Obviously +then, the statement is not entirely true. However, it's also the case +that one must resort to nearly such an extreme demonstration before +finding the counterexample.

+ +

'Phase stereo' is simply a more aggressive quantization of the polar +angle vector; above 4kHz it's generally quite safe to quantize noise +and noisy elements to only a handful of allowed phases, or to thin the +phase with respect to the magnitude. The phases of high amplitude +pure tones may or may not be preserved more carefully (they are +relatively rare and L/R tend to be in phase, so there is generally +little reason not to spend a few more bits on them).

+ +

example: eight phase stereo

+ +

Vorbis may implement phase stereo coupling by preserving the entirety +of the magnitude vector (essential to fine amplitude and energy +resolution overall) and quantizing the angle vector to one of only +four possible values. Given that the magnitude vector may be positive +or negative, this results in left and right phase having eight +possible permutations, thus 'eight phase stereo':

+ +

eight phase

+ +

Left and right may be in phase (positive or negative), the most common +case by far, or out of phase by 90 or 180 degrees.

+ +

example: four phase stereo

+ +

Similarly, four phase stereo takes the quantization one step further; +it allows only in-phase and 180 degree out-of-phase signals:

+ +

four phase

+ +

example: point stereo

+ +

Point stereo eliminates the possibility of out-of-phase signal +entirely. Any diffuse quality to a sound source tends to collapse +inward to a point somewhere within the stereo image. A practical +example would be balanced reverberations within a large, live space; +normally the sound is diffuse and soft, giving a sonic impression of +volume. In point-stereo, the reverberations would still exist, but +sound fairly firmly centered within the image (assuming the +reverberation was centered overall; if the reverberation is stronger +to the left, then the point of localization in point stereo would be +to the left). This effect is most noticeable at low and mid +frequencies and using headphones (which grant perfect stereo +separation). Point stereo is a graceful but generally easy to +detect degradation to the sound quality and is thus used in frequency +ranges where it is least noticeable.

+ +

Mixed Stereo

+ +

Mixed stereo is the simultaneous use of more than one of the above +stereo encoding models, generally using more aggressive modes in +higher frequencies, lower amplitudes or 'nearly' in-phase sound.

+ +

It is also the case that near-DC frequencies should be encoded using +lossless coupling to avoid frame blocking artifacts.

+ +

Vorbis Stereo Modes

+ +

Vorbis, as of 1.0, uses lossless stereo and a number of mixed modes +constructed out of lossless and point stereo. Phase stereo was used +in the rc2 encoder, but is not currently used for simplicity's sake. It +will likely be re-added to the stereo model in the future.

+ + + + + + + + + + + diff --git a/doc/v-comment.html b/doc/v-comment.html index e654004..1cd3a44 100644 --- a/doc/v-comment.html +++ b/doc/v-comment.html @@ -1,285 +1,285 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Ogg Vorbis I format specification: comment field and header specification

- -

Overview

- -

The Vorbis text comment header is the second (of three) header -packets that begin a Vorbis bitstream. It is meant for short, text -comments, not arbitrary metadata; arbitrary metadata belongs in a -separate logical bitstream (usually an XML stream type) that provides -greater structure and machine parseability.

- -

The comment field is meant to be used much like someone jotting a -quick note on the bottom of a CDR. It should be a little information to -remember the disc by and explain it to others; a short, to-the-point -text note that need not only be a couple words, but isn't going to be -more than a short paragraph. The essentials, in other words, whatever -they turn out to be, eg:

- -

-"Honest Bob and the Factory-to-Dealer-Incentives, _I'm Still Around_, -opening for Moxy Früvous, 1997" -

- -

Comment encoding

- -

Structure

- -

The comment header logically is a list of eight-bit-clean vectors; the -number of vectors is bounded to 2^32-1 and the length of each vector -is limited to 2^32-1 bytes. The vector length is encoded; the vector -contents themselves are not null terminated. In addition to the vector -list, there is a single vector for vendor name (also 8 bit clean, -length encoded in 32 bits). For example, the 1.0 release of libvorbis -set the vendor string to "Xiph.Org libVorbis I 20020717".

- -

The comment header is decoded as follows:

- -
-  1) [vendor_length] = read an unsigned integer of 32 bits
-  2) [vendor_string] = read a UTF-8 vector as [vendor_length] octets
-  3) [user_comment_list_length] = read an unsigned integer of 32 bits
-  4) iterate [user_comment_list_length] times {
-
-       5) [length] = read an unsigned integer of 32 bits
-       6) this iteration's user comment = read a UTF-8 vector as [length] octets
-
-     }
-
-  7) [framing_bit] = read a single bit as boolean
-  8) if ( [framing_bit] unset or end of packet ) then ERROR
-  9) done.
-
- -

Content vector format

- -

The comment vectors are structured similarly to a UNIX environment variable. -That is, comment fields consist of a field name and a corresponding value and -look like:

- -
-comment[0]="ARTIST=me"; 
-comment[1]="TITLE=the sound of Vorbis"; 
-
- - - -

Field names

- -

Below is a proposed, minimal list of standard field names with a -description of intended use. No single or group of field names is -mandatory; a comment header may contain one, all or none of the names -in this list.

- -
- -
TITLE
-
Track/Work name
- -
VERSION
-
The version field may be used to differentiate multiple -versions of the same track title in a single collection. -(e.g. remix info)
- -
ALBUM
-
The collection name to which this track belongs
- -
TRACKNUMBER
-
The track number of this piece if part of a specific larger collection or album
- -
ARTIST
-
The artist generally considered responsible for the work. In popular music -this is usually the performing band or singer. For classical music it would be -the composer. For an audio book it would be the author of the original text.
- -
PERFORMER
-
The artist(s) who performed the work. In classical music this would be the -conductor, orchestra, soloists. In an audio book it would be the actor who did -the reading. In popular music this is typically the same as the ARTIST and -is omitted.
- -
COPYRIGHT
-
Copyright attribution, e.g., '2001 Nobody's Band' or '1999 Jack Moffitt'
- -
LICENSE
-
License information, eg, 'All Rights Reserved', 'Any -Use Permitted', a URL to a license such as a Creative Commons license -("www.creativecommons.org/blahblah/license.html") or the EFF Open -Audio License ('distributed under the terms of the Open Audio -License. see http://www.eff.org/IP/Open_licenses/eff_oal.html for -details'), etc.
- -
ORGANIZATION
-
Name of the organization producing the track (i.e. -the 'record label')
- -
DESCRIPTION
-
A short text description of the contents
- -
GENRE
-
A short text indication of music genre
- -
DATE
-
Date the track was recorded
- -
LOCATION
-
Location where track was recorded
- -
CONTACT
-
Contact information for the creators or distributors of the track. -This could be a URL, an email address, the physical address of -the producing label.
- -
ISRC
-
ISRC number for the track; see the -ISRC intro page for more information on ISRC numbers.
- -
- -

Implications

- - - -

Encoding

- -

The comment header comprises the entirety of the second bitstream -header packet. Unlike the first bitstream header packet, it is not -generally the only packet on the second page and may not be restricted -to within the second bitstream page. The length of the comment header -packet is (practically) unbounded. The comment header packet is not -optional; it must be present in the bitstream even if it is -effectively empty.

- -

The comment header is encoded as follows (as per Ogg's standard -bitstream mapping which renders least-significant-bit of the word to be -coded into the least significant available bit of the current -bitstream octet first):

- -
    -
  1. Vendor string length (32 bit unsigned quantity specifying number of octets)
  2. -
  3. Vendor string ([vendor string length] octets coded from beginning of string -to end of string, not null terminated)
  4. -
  5. Number of comment fields (32 bit unsigned quantity specifying number of fields)
  6. -
  7. Comment field 0 length (if [Number of comment fields]>0; 32 bit unsigned -quantity specifying number of octets)
  8. -
  9. Comment field 0 ([Comment field 0 length] octets coded from beginning of -string to end of string, not null terminated)
  10. -
  11. Comment field 1 length (if [Number of comment fields]>1...)...
  12. -
- -

This is actually somewhat easier to describe in code; implementation of the above -can be found in vorbis/lib/info.c:_vorbis_pack_comment(),_vorbis_unpack_comment()

- - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Ogg Vorbis I format specification: comment field and header specification

+ +

Overview

+ +

The Vorbis text comment header is the second (of three) header +packets that begin a Vorbis bitstream. It is meant for short, text +comments, not arbitrary metadata; arbitrary metadata belongs in a +separate logical bitstream (usually an XML stream type) that provides +greater structure and machine parseability.

+ +

The comment field is meant to be used much like someone jotting a +quick note on the bottom of a CDR. It should be a little information to +remember the disc by and explain it to others; a short, to-the-point +text note that need not only be a couple words, but isn't going to be +more than a short paragraph. The essentials, in other words, whatever +they turn out to be, e.g.:

+ +

+"Honest Bob and the Factory-to-Dealer-Incentives, _I'm Still Around_, +opening for Moxy Früvous, 1997" +

+ +

Comment encoding

+ +

Structure

+ +

The comment header logically is a list of eight-bit-clean vectors; the +number of vectors is bounded to 2^32-1 and the length of each vector +is limited to 2^32-1 bytes. The vector length is encoded; the vector +contents themselves are not null terminated. In addition to the vector +list, there is a single vector for vendor name (also 8 bit clean, +length encoded in 32 bits). For example, the 1.0 release of libvorbis +set the vendor string to "Xiph.Org libVorbis I 20020717".

+ +

The comment header is decoded as follows:

+ +
+  1) [vendor_length] = read an unsigned integer of 32 bits
+  2) [vendor_string] = read a UTF-8 vector as [vendor_length] octets
+  3) [user_comment_list_length] = read an unsigned integer of 32 bits
+  4) iterate [user_comment_list_length] times {
+
+       5) [length] = read an unsigned integer of 32 bits
+       6) this iteration's user comment = read a UTF-8 vector as [length] octets
+
+     }
+
+  7) [framing_bit] = read a single bit as boolean
+  8) if ( [framing_bit] unset or end of packet ) then ERROR
+  9) done.
+
+ +

Content vector format

+ +

The comment vectors are structured similarly to a UNIX environment variable. +That is, comment fields consist of a field name and a corresponding value and +look like:

+ +
+comment[0]="ARTIST=me"; 
+comment[1]="TITLE=the sound of Vorbis"; 
+
+ + + +

Field names

+ +

Below is a proposed, minimal list of standard field names with a +description of intended use. No single or group of field names is +mandatory; a comment header may contain one, all or none of the names +in this list.

+ +
+ +
TITLE
+
Track/Work name
+ +
VERSION
+
The version field may be used to differentiate multiple +versions of the same track title in a single collection. +(e.g. remix info)
+ +
ALBUM
+
The collection name to which this track belongs
+ +
TRACKNUMBER
+
The track number of this piece if part of a specific larger collection or album
+ +
ARTIST
+
The artist generally considered responsible for the work. In popular music +this is usually the performing band or singer. For classical music it would be +the composer. For an audio book it would be the author of the original text.
+ +
PERFORMER
+
The artist(s) who performed the work. In classical music this would be the +conductor, orchestra, soloists. In an audio book it would be the actor who did +the reading. In popular music this is typically the same as the ARTIST and +is omitted.
+ +
COPYRIGHT
+
Copyright attribution, e.g., '2001 Nobody's Band' or '1999 Jack Moffitt'
+ +
LICENSE
+
License information, e.g., 'All Rights Reserved', 'Any +Use Permitted', a URL to a license such as a Creative Commons license +("www.creativecommons.org/blahblah/license.html") or the EFF Open +Audio License ('distributed under the terms of the Open Audio +License. see http://www.eff.org/IP/Open_licenses/eff_oal.html for +details'), etc.
+ +
ORGANIZATION
+
Name of the organization producing the track (i.e. +the 'record label')
+ +
DESCRIPTION
+
A short text description of the contents
+ +
GENRE
+
A short text indication of music genre
+ +
DATE
+
Date the track was recorded
+ +
LOCATION
+
Location where track was recorded
+ +
CONTACT
+
Contact information for the creators or distributors of the track. +This could be a URL, an email address, the physical address of +the producing label.
+ +
ISRC
+
ISRC number for the track; see the +ISRC intro page for more information on ISRC numbers.
+ +
+ +

Implications

+ + + +

Encoding

+ +

The comment header comprises the entirety of the second bitstream +header packet. Unlike the first bitstream header packet, it is not +generally the only packet on the second page and may not be restricted +to within the second bitstream page. The length of the comment header +packet is (practically) unbounded. The comment header packet is not +optional; it must be present in the bitstream even if it is +effectively empty.

+ +

The comment header is encoded as follows (as per Ogg's standard +bitstream mapping which renders least-significant-bit of the word to be +coded into the least significant available bit of the current +bitstream octet first):

+ +
    +
  1. Vendor string length (32 bit unsigned quantity specifying number of octets)
  2. +
  3. Vendor string ([vendor string length] octets coded from beginning of string +to end of string, not null terminated)
  4. +
  5. Number of comment fields (32 bit unsigned quantity specifying number of fields)
  6. +
  7. Comment field 0 length (if [Number of comment fields]>0; 32 bit unsigned +quantity specifying number of octets)
  8. +
  9. Comment field 0 ([Comment field 0 length] octets coded from beginning of +string to end of string, not null terminated)
  10. +
  11. Comment field 1 length (if [Number of comment fields]>1...)...
  12. +
+ +

This is actually somewhat easier to describe in code; implementation of the above +can be found in vorbis/lib/info.c:_vorbis_pack_comment(),_vorbis_unpack_comment()

+ + + + + diff --git a/doc/vorbis-fidelity.html b/doc/vorbis-fidelity.html index 0d88088..c62f355 100644 --- a/doc/vorbis-fidelity.html +++ b/doc/vorbis-fidelity.html @@ -1,180 +1,180 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Ogg Vorbis: Fidelity measurement and terminology discussion

- -

Terminology discussed in this document is based on common terminology -associated with contemporary codecs such as MPEG I audio layer 3 -(mp3). However, some differences in terminology are useful in the -context of Vorbis as Vorbis functions somewhat differently than most -current formats. For clarity, then, we describe a common terminology -for discussion of Vorbis's and other formats' audio quality.

- -

Subjective and Objective

- -

Objective fidelity is a measure, based on a computable, -mechanical metric, of how carefully an output matches an input. For -example, a stereo amplifier may claim to introduce less that .01% -total harmonic distortion when amplifying an input signal; this claim -is easy to verify given proper equipment, and any number of testers are -likely to arrive at the same, exact results. One need not listen to -the equipment to make this measurement.

- -

However, given two amplifiers with identical, verifiable objective -specifications, listeners may strongly prefer the sound quality of one -over the other. This is actually the case in the decades old debate -[some would say jihad] among audiophiles involving vacuum tube versus -solid state amplifiers. There are people who can tell the difference, -and strongly prefer one over the other despite seemingly identical, -measurable quality. This preference is subjective and -difficult to measure but nonetheless real.

- -

Individual elements of subjective differences often can be qualified, -but overall subjective quality generally is not measurable. Different -observers are likely to disagree on the exact results of a subjective -test as each observer's perspective differs. When measuring -subjective qualities, the best one can hope for is average, empirical -results that show statistical significance across a group.

- -

Perceptual codecs are most concerned with subjective, not objective, -quality. This is why evaluating a perceptual codec via distortion -measures and sonograms alone is useless; these objective measures may -provide insight into the quality or functioning of a codec, but cannot -answer the much squishier subjective question, "Does it sound -good?". The tube amplifier example is perhaps not the best as very few -people can hear, or care to hear, the minute differences between tubes -and transistors, whereas the subjective differences in perceptual -codecs tend to be quite large even when objective differences are -not.

- -

Fidelity, Artifacts and Differences

- -

Audio artifacts and loss of fidelity or more simply -put, audio differences are not the same thing.

- -

A loss of fidelity implies differences between the perceived input and -output signal; it does not necessarily imply that the differences in -output are displeasing or that the output sounds poor (although this -is often the case). Tube amplifiers are not higher fidelity -than modern solid state and digital systems. They simply produce a -form of distortion and coloring that is either unnoticeable or actually -pleasing to many ears.

- -

As compared to an original signal using hard metrics, all perceptual -codecs [ASPEC, ATRAC, MP3, WMA, AAC, TwinVQ, AC3 and Vorbis included] -lose objective fidelity in order to reduce bitrate. This is fact. The -idea is to lose fidelity in ways that cannot be perceived. However, -most current streaming applications demand bitrates lower than what -can be achieved by sacrificing only objective fidelity; this is also -fact, despite whatever various company press releases might claim. -Subjective fidelity eventually must suffer in one way or another.

- -

The goal is to choose the best possible tradeoff such that the -fidelity loss is graceful and not obviously noticeable. Most listeners -of FM radio do not realize how much lower fidelity that medium is as -compared to compact discs or DAT. However, when compared directly to -source material, the difference is obvious. A cassette tape is lower -fidelity still, and yet the degradation, relatively speaking, is -graceful and generally easy not to notice. Compare this graceful loss -of quality to an average 44.1kHz stereo mp3 encoded at 80 or 96kbps. -The mp3 might actually be higher objective fidelity but subjectively -sounds much worse.

- -

Thus, when a CODEC must sacrifice subjective quality in order -to satisfy a user's requirements, the result should be a -difference that is generally either difficult to notice -without comparison, or easy to ignore. An artifact, on the -other hand, is an element introduced into the output that is -immediately noticeable, obviously foreign, and undesired. The famous -'underwater' or 'twinkling' effect synonymous with low bitrate (or -poorly encoded) mp3 is an example of an artifact. This -working definition differs slightly from common usage, but the coined -distinction between differences and artifacts is useful for our -discussion.

- -

The goal, when it is absolutely necessary to sacrifice subjective -fidelity, is obviously to strive for differences and not artifacts. -The vast majority of codecs today fail at this task miserably, -predictably, and regularly in one way or another. Avoiding such -failures when it is necessary to sacrifice subjective quality is a -fundamental design objective of Vorbis and that objective is reflected -in Vorbis's design and tuning.

- - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Ogg Vorbis: Fidelity measurement and terminology discussion

+ +

Terminology discussed in this document is based on common terminology +associated with contemporary codecs such as MPEG I audio layer 3 +(mp3). However, some differences in terminology are useful in the +context of Vorbis as Vorbis functions somewhat differently than most +current formats. For clarity, then, we describe a common terminology +for discussion of Vorbis's and other formats' audio quality.

+ +

Subjective and Objective

+ +

Objective fidelity is a measure, based on a computable, +mechanical metric, of how carefully an output matches an input. For +example, a stereo amplifier may claim to introduce less than .01% +total harmonic distortion when amplifying an input signal; this claim +is easy to verify given proper equipment, and any number of testers are +likely to arrive at the same, exact results. One need not listen to +the equipment to make this measurement.

+ +

However, given two amplifiers with identical, verifiable objective +specifications, listeners may strongly prefer the sound quality of one +over the other. This is actually the case in the decades old debate +[some would say jihad] among audiophiles involving vacuum tube versus +solid state amplifiers. There are people who can tell the difference, +and strongly prefer one over the other despite seemingly identical, +measurable quality. This preference is subjective and +difficult to measure but nonetheless real.

+ +

Individual elements of subjective differences often can be qualified, +but overall subjective quality generally is not measurable. Different +observers are likely to disagree on the exact results of a subjective +test as each observer's perspective differs. When measuring +subjective qualities, the best one can hope for is average, empirical +results that show statistical significance across a group.

+ +

Perceptual codecs are most concerned with subjective, not objective, +quality. This is why evaluating a perceptual codec via distortion +measures and sonograms alone is useless; these objective measures may +provide insight into the quality or functioning of a codec, but cannot +answer the much squishier subjective question, "Does it sound +good?". The tube amplifier example is perhaps not the best as very few +people can hear, or care to hear, the minute differences between tubes +and transistors, whereas the subjective differences in perceptual +codecs tend to be quite large even when objective differences are +not.

+ +

Fidelity, Artifacts and Differences

+ +

Audio artifacts and loss of fidelity, or more simply +put, audio differences, are not the same thing.

+ +

A loss of fidelity implies differences between the perceived input and +output signal; it does not necessarily imply that the differences in +output are displeasing or that the output sounds poor (although this +is often the case). Tube amplifiers are not higher fidelity +than modern solid state and digital systems. They simply produce a +form of distortion and coloring that is either unnoticeable or actually +pleasing to many ears.

+ +

As compared to an original signal using hard metrics, all perceptual +codecs [ASPEC, ATRAC, MP3, WMA, AAC, TwinVQ, AC3 and Vorbis included] +lose objective fidelity in order to reduce bitrate. This is fact. The +idea is to lose fidelity in ways that cannot be perceived. However, +most current streaming applications demand bitrates lower than what +can be achieved by sacrificing only objective fidelity; this is also +fact, despite whatever various company press releases might claim. +Subjective fidelity eventually must suffer in one way or another.

+ +

The goal is to choose the best possible tradeoff such that the +fidelity loss is graceful and not obviously noticeable. Most listeners +of FM radio do not realize how much lower fidelity that medium is as +compared to compact discs or DAT. However, when compared directly to +source material, the difference is obvious. A cassette tape is lower +fidelity still, and yet the degradation, relatively speaking, is +graceful and generally easy not to notice. Compare this graceful loss +of quality to an average 44.1kHz stereo mp3 encoded at 80 or 96kbps. +The mp3 might actually be higher objective fidelity but subjectively +sounds much worse.

+ +

Thus, when a CODEC must sacrifice subjective quality in order +to satisfy a user's requirements, the result should be a +difference that is generally either difficult to notice +without comparison, or easy to ignore. An artifact, on the +other hand, is an element introduced into the output that is +immediately noticeable, obviously foreign, and undesired. The famous +'underwater' or 'twinkling' effect synonymous with low bitrate (or +poorly encoded) mp3 is an example of an artifact. This +working definition differs slightly from common usage, but the coined +distinction between differences and artifacts is useful for our +discussion.

+ +

The goal, when it is absolutely necessary to sacrifice subjective +fidelity, is obviously to strive for differences and not artifacts. +The vast majority of codecs today fail at this task miserably, +predictably, and regularly in one way or another. Avoiding such +failures when it is necessary to sacrifice subjective quality is a +fundamental design objective of Vorbis and that objective is reflected +in Vorbis's design and tuning.

+ + + + + diff --git a/doc/vorbis.html b/doc/vorbis.html index 92e4373..66df2f4 100644 --- a/doc/vorbis.html +++ b/doc/vorbis.html @@ -1,234 +1,234 @@ - - - - - -Ogg Vorbis Documentation - - - - - - - - - -

Ogg Vorbis encoding format documentation

- -

waitAs of writing, not all the below document -links are live. They will be populated as we complete the documents.

- -

Documents

- - - - - - - - - -

Description

- -

Ogg Vorbis is a general purpose compressed audio format -for high quality (44.1-48.0kHz, 16+ bit, polyphonic) audio and music -at moderate fixed and variable bitrates (40-80 kb/s/channel). This -places Vorbis in the same class as audio representations including -MPEG-1 audio layer 3, MPEG-4 audio (AAC and TwinVQ), and PAC.

- -

Vorbis is the first of a planned family of Ogg multimedia coding -formats being developed as part of the Xiph.org Foundation's Ogg multimedia -project. See http://www.xiph.org/ -for more information.

- -

Vorbis technical documents

- -

A Vorbis encoder takes in overlapping (but contiguous) short-time -segments of audio data. The encoder analyzes the content of the audio -to determine an optimal compact representation; this phase of encoding -is known as analysis. For each short-time block of sound, -the encoder then packs an efficient representation of the signal, as -determined by analysis, into a raw packet much smaller than the size -required by the original signal; this phase is coding. -Lastly, in a streaming environment, the raw packets are then -structured into a continuous stream of octets; this last phase is -streaming. Note that the stream of octets is referred to both -as a 'byte-' and 'bit-'stream; the latter usage is acceptible as the -stream of octets is a physical representation of a true logical -bit-by-bit stream.

- -

A Vorbis decoder performs a mirror image process of extracting the -original sequence of raw packets from an Ogg stream (stream -decomposition), reconstructing the signal representation from the -raw data in the packet (decoding) and them reconstituting an -audio signal from the decoded representation (synthesis).

- -

The Programming with libvorbis -documents discuss use of the reference Vorbis codec library -(libvorbis) produced by the Xiph.org Foundation.

- -

The data representations and algorithms necessary at each step to -encode and decode Ogg Vorbis bitstreams are described by the below -documents in sufficient detail to construct a complete Vorbis codec. -Note that at the time of writing, Vorbis is still in a 'Request For -Comments' stage of development; despite being in advanced stages of -development, input from the multimedia community is welcome.

- -

Vorbis analysis and synthesis

- -

Analysis begins by seperating an input audio stream into individual, -overlapping short-time segments of audio data. These segments are -then transformed into an alternate representation, seeking to -represent the original signal in a more efficient form that codes into -a smaller number of bytes. The analysis and transformation stage is -the most complex element of producing a Vorbis bitstream.

- -

The corresponding synthesis step in the decoder is simpler; there is -no analysis to perform, merely a mechanical, deterministic -reconstruction of the original audio data from the transform-domain -representation.

- - - -

Vorbis coding and decoding

- -

Coding and decoding converts the transform-domain representation of -the original audio produced by analysis to and from a bitwise packed -raw data packet. Coding and decoding consist of two logically -orthogonal concepts, back-end coding and bitpacking.

- -

Back-end coding uses a probability model to represent the raw numbers -of the audio representation in as few physical bits as possible; -familiar examples of back-end coding include Huffman coding and Vector -Quantization.

- -

Bitpacking arranges the variable sized words of the back-end -coding into a vector of octets without wasting space. The octets -produced by coding a single short-time audio segment is one raw Vorbis -packet.

- - - -

Vorbis streaming and stream decomposition

- -

Vorbis packets contain the raw, bitwise-compressed representation of a -snippet of audio. These packets contain no structure and cannot be -strung together directly into a stream; for streamed transmission and -storage, Vorbis packets are encoded into an Ogg bitstream.

- - - - - - - + + + + + +Ogg Vorbis Documentation + + + + + + + + + +

Ogg Vorbis encoding format documentation

+ +

waitAs of writing, not all the below document +links are live. They will be populated as we complete the documents.

+ +

Documents

+ + + + + + + + + +

Description

+ +

Ogg Vorbis is a general purpose compressed audio format +for high quality (44.1-48.0kHz, 16+ bit, polyphonic) audio and music +at moderate fixed and variable bitrates (40-80 kb/s/channel). This +places Vorbis in the same class as audio representations including +MPEG-1 audio layer 3, MPEG-4 audio (AAC and TwinVQ), and PAC.

+ +

Vorbis is the first of a planned family of Ogg multimedia coding +formats being developed as part of the Xiph.org Foundation's Ogg multimedia +project. See http://www.xiph.org/ +for more information.

+ +

Vorbis technical documents

+ +

A Vorbis encoder takes in overlapping (but contiguous) short-time +segments of audio data. The encoder analyzes the content of the audio +to determine an optimal compact representation; this phase of encoding +is known as analysis. For each short-time block of sound, +the encoder then packs an efficient representation of the signal, as +determined by analysis, into a raw packet much smaller than the size +required by the original signal; this phase is coding. +Lastly, in a streaming environment, the raw packets are then +structured into a continuous stream of octets; this last phase is +streaming. Note that the stream of octets is referred to both +as a 'byte-' and 'bit-'stream; the latter usage is acceptable as the +stream of octets is a physical representation of a true logical +bit-by-bit stream.

+ +

A Vorbis decoder performs a mirror image process of extracting the +original sequence of raw packets from an Ogg stream (stream +decomposition), reconstructing the signal representation from the +raw data in the packet (decoding) and then reconstituting an +audio signal from the decoded representation (synthesis).

+ +

The Programming with libvorbis +documents discuss use of the reference Vorbis codec library +(libvorbis) produced by the Xiph.org Foundation.

+ +

The data representations and algorithms necessary at each step to +encode and decode Ogg Vorbis bitstreams are described by the below +documents in sufficient detail to construct a complete Vorbis codec. +Note that at the time of writing, Vorbis is still in a 'Request For +Comments' stage of development; despite being in advanced stages of +development, input from the multimedia community is welcome.

+ +

Vorbis analysis and synthesis

+ +

Analysis begins by separating an input audio stream into individual, +overlapping short-time segments of audio data. These segments are +then transformed into an alternate representation, seeking to +represent the original signal in a more efficient form that codes into +a smaller number of bytes. The analysis and transformation stage is +the most complex element of producing a Vorbis bitstream.

+ +

The corresponding synthesis step in the decoder is simpler; there is +no analysis to perform, merely a mechanical, deterministic +reconstruction of the original audio data from the transform-domain +representation.

+ + + +

Vorbis coding and decoding

+ +

Coding and decoding convert the transform-domain representation of +the original audio produced by analysis to and from a bitwise packed +raw data packet. Coding and decoding consist of two logically +orthogonal concepts, back-end coding and bitpacking.

+ +

Back-end coding uses a probability model to represent the raw numbers +of the audio representation in as few physical bits as possible; +familiar examples of back-end coding include Huffman coding and Vector +Quantization.

+ +

Bitpacking arranges the variable-sized words of the back-end +coding into a vector of octets without wasting space. The octets +produced by coding a single short-time audio segment are one raw Vorbis +packet.

+ + + +

Vorbis streaming and stream decomposition

+ +

Vorbis packets contain the raw, bitwise-compressed representation of a +snippet of audio. These packets contain no structure and cannot be +strung together directly into a stream; for streamed transmission and +storage, Vorbis packets are encoded into an Ogg bitstream.

+ + + + + + + -- 2.7.4