mirror of
https://git.ryujinx.app/ryubing/ryujinx.git
synced 2025-04-28 08:25:13 +00:00
Unmerged PR from OG Ryujinx (#4367). From @gdkchan: > The main goal of this change is porting the loop filtering from libvpx, which should fix the block artifacts on some VP9 videos on games using NVDEC to decode them. In addition to that, there are two other changes: > > - The remaining decoder code required to decode a VP9 video (with headers included) has been added. That was done because it's much better to test the decoder standalone with a video file. I decided to keep that code on the emulator, even if some of it is unused, since it makes standalone testing easier in the future too, and we can include unit tests with video files. > - Large refactoring of both new and existing code to conform with our conding [sic] styles, done by @TSRBerry (thanks!) Some of it has been automated. > > Since we had no loop filtering before, this change will make video decoding slower. That may cause frame drop etc if the decoder is not fast enough in some games. I plan to optimize the decoder more in the future to make up for that, but if possible I'd prefer to not do it as part of this PR, but if the perf loss is too severe I might consider. > > This will need to be tested on games that had the block artifacts, it would be nice to confirm if they match hardware now, and get some before/after screenshots etc. Comment from @Bjorn29512: > Significantly improves the block artifacts in FE: Engage. > > Before: >  > > After: >  --------- Co-authored-by: gdkchan <gab.dark.100@gmail.com> Co-authored-by: TSR Berry <20988865+TSRBerry@users.noreply.github.com>
323 lines
13 KiB
C#
323 lines
13 KiB
C#
using Ryujinx.Common.Memory;
|
|
using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
|
|
using Ryujinx.Graphics.Nvdec.Vp9.Types;
|
|
using Ryujinx.Graphics.Video;
|
|
using System;
|
|
using System.Diagnostics;
|
|
using System.Runtime.InteropServices;
|
|
using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.InvTxfm;
|
|
|
|
namespace Ryujinx.Graphics.Nvdec.Vp9
|
|
{
|
|
internal static class Detokenize
|
|
{
|
|
// Indices into the three-entry probability set (Array3<byte>) that DecodeCoefs
// selects for the current band and context:
//   EobContextNode  - probability for the first bool read at each position; a zero
//                     result ends the block.
//   ZeroContextNode - probability for the bool read in the zero-token loop; a zero
//                     result means the coefficient at that position is zero.
//   OneContextNode  - probability for the bool that separates a magnitude of one
//                     (zero result) from larger magnitudes (non-zero result).
// The three values are consecutive, so each one is expressed relative to the previous.
private const int EobContextNode = 0;
private const int ZeroContextNode = EobContextNode + 1;
private const int OneContextNode = ZeroContextNode + 1;
|
|
|
|
/// <summary>
/// Derives the context for scan position <paramref name="c"/> from the cached token
/// values of its two neighbors.
/// </summary>
/// <param name="neighbors">Table holding two entries per scan position; each entry is an index into <paramref name="tokenCache"/>.</param>
/// <param name="tokenCache">Token values recorded so far, indexed by the values stored in <paramref name="neighbors"/>.</param>
/// <param name="c">Scan position whose context is requested.</param>
/// <returns>The rounded-up average of the two neighboring token values: (1 + first + second) >> 1.</returns>
private static int GetCoefContext(ReadOnlySpan<short> neighbors, ReadOnlySpan<byte> tokenCache, int c)
{
    // Every scan position owns a pair of consecutive entries in the neighbors table.
    const int entriesPerPosition = 2;
    int pairStart = entriesPerPosition * c;

    int firstToken = tokenCache[neighbors[pairStart]];
    int secondToken = tokenCache[neighbors[pairStart + 1]];

    // Adding one before halving rounds the average up.
    return (1 + firstToken + secondToken) >> 1;
}
|
|
|
|
/// <summary>
/// Reads the coefficient tokens of one transform block from the bit reader, scales each
/// non-zero coefficient by the dequantization factor and stores it in
/// <paramref name="dqcoeff"/> at the position given by <paramref name="scan"/>.
/// </summary>
/// <param name="xd">Block decoding state; supplies the probabilities (Fc), the optional counters (Counts), the current mode info (Mi[0]) and Bd.</param>
/// <param name="type">Plane type, used as an index into the probability and counter tables.</param>
/// <param name="dqcoeff">Destination for the dequantized coefficients.</param>
/// <param name="txSize">Transform size; determines the maximum number of positions (16 &lt;&lt; (2 * txSize)).</param>
/// <param name="dq">Dequantization factors: entry 0 is applied at the first scan position, entry 1 at every later one.</param>
/// <param name="ctx">Context to use for the first scan position.</param>
/// <param name="scan">Maps a scan index to a coefficient position.</param>
/// <param name="nb">Neighbor table passed to <see cref="GetCoefContext"/>.</param>
/// <param name="r">Bit reader; its Value, Range and Count are updated before returning.</param>
/// <returns>The scan index at which decoding stopped.</returns>
private static int DecodeCoefs(
    ref MacroBlockD xd,
    PlaneType type,
    Span<int> dqcoeff,
    TxSize txSize,
    ref Array2<short> dq,
    int ctx,
    ReadOnlySpan<short> scan,
    ReadOnlySpan<short> nb,
    ref Reader r)
{
    // Table indices that stay fixed for the whole block.
    int txIndex = (int)txSize;
    int planeIndex = (int)type;
    int refIndex = xd.Mi[0].Value.IsInterBlock() ? 1 : 0;

    int maxEob = 16 << (txIndex << 1);
    int dqShift = txSize == TxSize.Tx32x32 ? 1 : 0;

    // Counters are optional; every update below is guarded by this flag.
    bool updateCounts = !xd.Counts.IsNull;
    ref Vp9BackwardUpdates counts = ref xd.Counts.Value;
    ref Vp9EntropyProbs fc = ref xd.Fc.Value;
    ref Array6<Array6<Array3<byte>>> coefProbs = ref fc.CoefProbs[txIndex][planeIndex][refIndex];

    Span<byte> tokenCache = stackalloc byte[32 * 32];

    // The band table is consumed one entry per decoded position.
    ReadOnlySpan<byte> bandTranslate = Luts.GetBandTranslate(txSize);
    int bandIndex = 0;

    // The probability table and bit count for the largest token category depend on xd.Bd.
    ReadOnlySpan<byte> cat6Prob;
    int cat6Bits;
    if (xd.Bd == 12)
    {
        cat6Prob = Luts.Cat6ProbHigh12;
        cat6Bits = 18;
    }
    else if (xd.Bd == 10)
    {
        cat6Prob = Luts.Cat6ProbHigh12.Slice(2);
        cat6Bits = 16;
    }
    else
    {
        cat6Prob = Luts.Cat6Prob;
        cat6Bits = 14;
    }

    // Keep value, range, and count as locals. The compiler produces better
    // results with the locals than using r directly.
    ulong value = r.Value;
    uint range = r.Range;
    int count = r.Count;

    short dqv = dq[0];
    int scanIndex = 0;

    while (scanIndex < maxEob)
    {
        int band = bandTranslate[bandIndex++];
        ref Array3<byte> prob = ref coefProbs[band][ctx];

        if (updateCounts)
        {
            counts.EobBranch[txIndex][planeIndex][refIndex][band][ctx]++;
        }

        // A zero here means no more coefficients follow.
        if (r.ReadBool(prob[EobContextNode], ref value, ref count, ref range) == 0)
        {
            if (updateCounts)
            {
                counts.Coef[txIndex][planeIndex][refIndex][band][ctx][Constants.EobModelToken]++;
            }

            break;
        }

        // Skip over a run of zero coefficients.
        while (r.ReadBool(prob[ZeroContextNode], ref value, ref count, ref range) == 0)
        {
            if (updateCounts)
            {
                counts.Coef[txIndex][planeIndex][refIndex][band][ctx][Constants.ZeroToken]++;
            }

            dqv = dq[1];
            tokenCache[scan[scanIndex]] = 0;
            scanIndex++;

            if (scanIndex >= maxEob)
            {
                // Zero tokens at the end (no eob token)
                r.Value = value;
                r.Range = range;
                r.Count = count;

                return scanIndex;
            }

            ctx = GetCoefContext(nb, tokenCache, scanIndex);
            band = bandTranslate[bandIndex++];
            prob = ref coefProbs[band][ctx];
        }

        // Dequantized magnitude of the coefficient; the sign is read afterwards.
        int dequantized;

        if (r.ReadBool(prob[OneContextNode], ref value, ref count, ref range) == 0)
        {
            if (updateCounts)
            {
                counts.Coef[txIndex][planeIndex][refIndex][band][ctx][Constants.OneToken]++;
            }

            tokenCache[scan[scanIndex]] = 1;
            dequantized = dqv >> dqShift;
        }
        else
        {
            ReadOnlySpan<byte> pareto = Luts.Pareto8Full[prob[Constants.PivotNode] - 1];

            if (updateCounts)
            {
                counts.Coef[txIndex][planeIndex][refIndex][band][ctx][Constants.TwoToken]++;
            }

            if (r.ReadBool(pareto[0], ref value, ref count, ref range) == 0)
            {
                // Small magnitudes: 2, 3 or 4.
                if (r.ReadBool(pareto[1], ref value, ref count, ref range) == 0)
                {
                    tokenCache[scan[scanIndex]] = 2;
                    dequantized = (2 * dqv) >> dqShift;
                }
                else
                {
                    tokenCache[scan[scanIndex]] = 3;
                    dequantized = ((3 + r.ReadBool(pareto[2], ref value, ref count, ref range)) * dqv) >> dqShift;
                }
            }
            else
            {
                // Category tokens: a base value plus extra bits read from the stream.
                int magnitude;

                if (r.ReadBool(pareto[3], ref value, ref count, ref range) == 0)
                {
                    tokenCache[scan[scanIndex]] = 4;

                    if (r.ReadBool(pareto[4], ref value, ref count, ref range) == 0)
                    {
                        magnitude = Constants.Cat1MinVal +
                                    r.ReadCoeff(Luts.Cat1Prob, 1, ref value, ref count, ref range);
                    }
                    else
                    {
                        magnitude = Constants.Cat2MinVal +
                                    r.ReadCoeff(Luts.Cat2Prob, 2, ref value, ref count, ref range);
                    }
                }
                else
                {
                    tokenCache[scan[scanIndex]] = 5;

                    if (r.ReadBool(pareto[5], ref value, ref count, ref range) == 0)
                    {
                        if (r.ReadBool(pareto[6], ref value, ref count, ref range) == 0)
                        {
                            magnitude = Constants.Cat3MinVal +
                                        r.ReadCoeff(Luts.Cat3Prob, 3, ref value, ref count, ref range);
                        }
                        else
                        {
                            magnitude = Constants.Cat4MinVal +
                                        r.ReadCoeff(Luts.Cat4Prob, 4, ref value, ref count, ref range);
                        }
                    }
                    else if (r.ReadBool(pareto[7], ref value, ref count, ref range) == 0)
                    {
                        magnitude = Constants.Cat5MinVal +
                                    r.ReadCoeff(Luts.Cat5Prob, 5, ref value, ref count, ref range);
                    }
                    else
                    {
                        magnitude = Constants.Cat6MinVal +
                                    r.ReadCoeff(cat6Prob, cat6Bits, ref value, ref count, ref range);
                    }
                }

                // The magnitude may use 18 bits, so multiply in 64-bit before shifting.
                dequantized = (int)(((long)magnitude * dqv) >> dqShift);
            }
        }

        // The last bool read for a coefficient is its sign (non-zero means negative).
        dqcoeff[scan[scanIndex]] = (int)HighbdCheckRange(
            r.ReadBool(128, ref value, ref count, ref range) != 0 ? -dequantized : dequantized,
            xd.Bd);

        scanIndex++;
        ctx = GetCoefContext(nb, tokenCache, scanIndex);
        dqv = dq[1];
    }

    // Publish the local reader state back to the reader.
    r.Value = value;
    r.Range = range;
    r.Count = count;

    return scanIndex;
}
|
|
|
|
/// <summary>
/// Computes the shift amounts applied to the above/left context patterns when a
/// transform block starting at (<paramref name="x"/>, <paramref name="y"/>) extends past
/// xd.MaxBlocksWide / xd.MaxBlocksHigh.
/// </summary>
/// <param name="xd">Block decoding state providing MaxBlocksWide and MaxBlocksHigh; a value of zero disables the corresponding check.</param>
/// <param name="ctxShiftA">Receives eight times the number of blocks by which the transform exceeds MaxBlocksWide; left untouched when it does not exceed it.</param>
/// <param name="ctxShiftL">Receives eight times the number of blocks by which the transform exceeds MaxBlocksHigh; left untouched when it does not exceed it.</param>
/// <param name="x">Horizontal block position of the transform.</param>
/// <param name="y">Vertical block position of the transform.</param>
/// <param name="txSizeInBlocks">Width and height of the transform, in blocks.</param>
private static void GetCtxShift(
    ref MacroBlockD xd,
    ref int ctxShiftA,
    ref int ctxShiftL,
    int x,
    int y,
    uint txSizeInBlocks)
{
    // Horizontal overshoot: one byte (8 bits) of shift per block beyond the limit.
    if (xd.MaxBlocksWide != 0 && txSizeInBlocks + x > xd.MaxBlocksWide)
    {
        ctxShiftA = (int)(txSizeInBlocks - (xd.MaxBlocksWide - x)) * 8;
    }

    // Vertical overshoot, handled the same way.
    if (xd.MaxBlocksHigh != 0 && txSizeInBlocks + y > xd.MaxBlocksHigh)
    {
        ctxShiftL = (int)(txSizeInBlocks - (xd.MaxBlocksHigh - y)) * 8;
    }
}
|
|
|
|
/// <summary>
/// Maps a plane index to a <see cref="PlaneType"/>.
/// </summary>
/// <param name="plane">Plane index.</param>
/// <returns>The plane type with value 1 for any positive index, otherwise the one with value 0.</returns>
private static PlaneType GetPlaneType(int plane)
{
    if (plane > 0)
    {
        return (PlaneType)1;
    }

    return (PlaneType)0;
}
|
|
|
|
/// <summary>
/// Decodes the coefficient tokens of one transform block and refreshes the above/left
/// context entries that the block covers.
/// </summary>
/// <param name="twd">Tile worker state holding the bit reader and the block decoding state.</param>
/// <param name="plane">Index of the plane being decoded.</param>
/// <param name="sc">Scan order supplying the scan and neighbor tables.</param>
/// <param name="x">Offset of the block into the plane's above context.</param>
/// <param name="y">Offset of the block into the plane's left context.</param>
/// <param name="txSize">Transform size of the block.</param>
/// <param name="segId">Index used to select the dequantization factors from SegDequant.</param>
/// <returns>The value returned by DecodeCoefs, or 0 for an unrecognized transform size.</returns>
public static int DecodeBlockTokens(
    ref TileWorkerData twd,
    int plane,
    Luts.ScanOrder sc,
    int x,
    int y,
    TxSize txSize,
    int segId)
{
    ref Reader r = ref twd.BitReader;
    ref MacroBlockD xd = ref twd.Xd;
    ref MacroBlockDPlane pd = ref xd.Plane[plane];
    ref Array2<short> dequant = ref pd.SegDequant[segId];

    Span<sbyte> above = pd.AboveContext.AsSpan().Slice(x);
    Span<sbyte> left = pd.LeftContext.AsSpan().Slice(y);

    PlaneType planeType = GetPlaneType(plane);
    Span<int> dqCoeff = pd.DqCoeff.AsSpan();

    int ctxShiftA = 0;
    int ctxShiftL = 0;

    // Each transform size covers 1, 2, 4 or 8 context entries, which are read and written
    // as a single integer of matching width. The written pattern (0x01 in every byte when
    // the block has coefficients) is shifted right by the amounts from GetCtxShift, which
    // clears its upper bytes.
    // NOTE(review): on a little-endian host the cleared bytes are the higher-index
    // entries, presumably the ones outside the visible area - confirm.
    switch (txSize)
    {
        case TxSize.Tx4x4:
        {
            int ctx = (above[0] != 0 ? 1 : 0) + (left[0] != 0 ? 1 : 0);
            int eob = DecodeCoefs(ref xd, planeType, dqCoeff, txSize, ref dequant, ctx, sc.Scan, sc.Neighbors, ref r);

            sbyte hasCoefs = (sbyte)(eob > 0 ? 1 : 0);
            left[0] = hasCoefs;
            above[0] = hasCoefs;

            return eob;
        }

        case TxSize.Tx8x8:
        {
            GetCtxShift(ref xd, ref ctxShiftA, ref ctxShiftL, x, y, 1 << (int)TxSize.Tx8x8);

            Span<ushort> above16 = MemoryMarshal.Cast<sbyte, ushort>(above);
            Span<ushort> left16 = MemoryMarshal.Cast<sbyte, ushort>(left);

            int ctx = (above16[0] != 0 ? 1 : 0) + (left16[0] != 0 ? 1 : 0);
            int eob = DecodeCoefs(ref xd, planeType, dqCoeff, txSize, ref dequant, ctx, sc.Scan, sc.Neighbors, ref r);

            int pattern = eob > 0 ? 0x0101 : 0;
            above16[0] = (ushort)(pattern >> ctxShiftA);
            left16[0] = (ushort)(pattern >> ctxShiftL);

            return eob;
        }

        case TxSize.Tx16x16:
        {
            GetCtxShift(ref xd, ref ctxShiftA, ref ctxShiftL, x, y, 1 << (int)TxSize.Tx16x16);

            Span<uint> above32 = MemoryMarshal.Cast<sbyte, uint>(above);
            Span<uint> left32 = MemoryMarshal.Cast<sbyte, uint>(left);

            int ctx = (above32[0] != 0 ? 1 : 0) + (left32[0] != 0 ? 1 : 0);
            int eob = DecodeCoefs(ref xd, planeType, dqCoeff, txSize, ref dequant, ctx, sc.Scan, sc.Neighbors, ref r);

            int pattern = eob > 0 ? 0x01010101 : 0;
            above32[0] = (uint)(pattern >> ctxShiftA);
            left32[0] = (uint)(pattern >> ctxShiftL);

            return eob;
        }

        case TxSize.Tx32x32:
        {
            GetCtxShift(ref xd, ref ctxShiftA, ref ctxShiftL, x, y, 1 << (int)TxSize.Tx32x32);

            // NOTE: Casting to ulong here is safe because the default memory
            // alignment is at least 8 bytes and the Tx32x32 is aligned on 8 byte
            // boundaries.
            Span<ulong> above64 = MemoryMarshal.Cast<sbyte, ulong>(above);
            Span<ulong> left64 = MemoryMarshal.Cast<sbyte, ulong>(left);

            int ctx = (above64[0] != 0 ? 1 : 0) + (left64[0] != 0 ? 1 : 0);
            int eob = DecodeCoefs(ref xd, planeType, dqCoeff, txSize, ref dequant, ctx, sc.Scan, sc.Neighbors, ref r);

            ulong pattern = eob > 0 ? 0x0101010101010101UL : 0UL;
            above64[0] = pattern >> ctxShiftA;
            left64[0] = pattern >> ctxShiftL;

            return eob;
        }

        default:
            Debug.Assert(false, "Invalid transform size.");

            return 0;
    }
}
|
|
}
|
|
} |