[maxwell/fermi_2d] gutter entire Software blitter

Signed-off-by: lizzie <lizzie@eden-emu.dev>
4 months ago · d3ed681bc1
7 changed files with 3 additions and 1550 deletions
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -37,10 +37,6 @@ add_library(video_core STATIC
    dirty_flags.h
    dma_pusher.cpp
    dma_pusher.h
-    engines/sw_blitter/blitter.cpp
-    engines/sw_blitter/blitter.h
-    engines/sw_blitter/converter.cpp
-    engines/sw_blitter/converter.h
    engines/const_buffer_info.h
    engines/draw_manager.cpp
    engines/draw_manager.h
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@ -7,7 +7,6 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "video_core/engines/fermi_2d.h"
-#include "video_core/engines/sw_blitter/blitter.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/surface.h"
@ -21,7 +20,6 @@ namespace Tegra::Engines {
 using namespace Texture;

 Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} {
-    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager);
    // Nvidia's OpenGL driver seems to assume these values
    regs.src.depth = 1;
    regs.dst.depth = 1;
@ -112,9 +110,9 @@ void Fermi2D::Blit() {
    }

    memory_manager.FlushCaching();
-    if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
-        sw_blitter->Blit(src, regs.dst, config);
-    }
+
+    bool was_accelerated = rasterizer->AccelerateSurfaceCopy(src, regs.dst, config);
+    ASSERT(was_accelerated && "Backend must always accelerate copies/blits");
 }

 } // namespace Tegra::Engines
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@ -304,7 +304,6 @@ public:

 private:
    VideoCore::RasterizerInterface* rasterizer = nullptr;
-    std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;
    MemoryManager& memory_manager;

    /// Performs the copy from the source surface to the destination surface as configured in the
--- a/src/video_core/engines/sw_blitter/blitter.cpp
+++ b/src/video_core/engines/sw_blitter/blitter.cpp
@ -1,240 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
-
-#include <algorithm>
-#include <cmath>
-#include <vector>
-
-#include "common/scratch_buffer.h"
-#include "video_core/engines/sw_blitter/blitter.h"
-#include "video_core/engines/sw_blitter/converter.h"
-#include "video_core/guest_memory.h"
-#include "video_core/memory_manager.h"
-#include "video_core/surface.h"
-#include "video_core/textures/decoders.h"
-
-namespace Tegra {
-class MemoryManager;
-}
-
-using VideoCore::Surface::BytesPerBlock;
-using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
-
-namespace Tegra::Engines::Blitter {
-
-using namespace Texture;
-
-namespace {
-
-constexpr size_t ir_components = 4;
-
-void NearestNeighbor(std::span<const u8> input, std::span<u8> output, u32 src_width, u32 src_height,
-                     u32 dst_width, u32 dst_height, size_t bpp) {
-    const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
-    const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
-    size_t src_y = 0;
-    for (u32 y = 0; y < dst_height; y++) {
-        size_t src_x = 0;
-        for (u32 x = 0; x < dst_width; x++) {
-            const size_t read_from = ((src_y * src_width + src_x) >> 32) * bpp;
-            const size_t write_to = (y * dst_width + x) * bpp;
-
-            std::memcpy(&output[write_to], &input[read_from], bpp);
-            src_x += dx_du;
-        }
-        src_y += dy_dv;
-    }
-}
-
-void NearestNeighborFast(std::span<const f32> input, std::span<f32> output, u32 src_width,
-                         u32 src_height, u32 dst_width, u32 dst_height) {
-    const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
-    const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
-    size_t src_y = 0;
-    for (u32 y = 0; y < dst_height; y++) {
-        size_t src_x = 0;
-        for (u32 x = 0; x < dst_width; x++) {
-            const size_t read_from = ((src_y * src_width + src_x) >> 32) * ir_components;
-            const size_t write_to = (y * dst_width + x) * ir_components;
-
-            std::memcpy(&output[write_to], &input[read_from], sizeof(f32) * ir_components);
-            src_x += dx_du;
-        }
-        src_y += dy_dv;
-    }
-}
-
-void Bilinear(std::span<const f32> input, std::span<f32> output, size_t src_width,
-              size_t src_height, size_t dst_width, size_t dst_height) {
-    const auto bilinear_sample = [](std::span<const f32> x0_y0, std::span<const f32> x1_y0,
-                                    std::span<const f32> x0_y1, std::span<const f32> x1_y1,
-                                    f32 weight_x, f32 weight_y) {
-        std::array<f32, ir_components> result{};
-        for (size_t i = 0; i < ir_components; i++) {
-            const f32 a = std::lerp(x0_y0[i], x1_y0[i], weight_x);
-            const f32 b = std::lerp(x0_y1[i], x1_y1[i], weight_x);
-            result[i] = std::lerp(a, b, weight_y);
-        }
-        return result;
-    };
-    const f32 dx_du =
-        dst_width > 1 ? static_cast<f32>(src_width - 1) / static_cast<f32>(dst_width - 1) : 0.f;
-    const f32 dy_dv =
-        dst_height > 1 ? static_cast<f32>(src_height - 1) / static_cast<f32>(dst_height - 1) : 0.f;
-    for (u32 y = 0; y < dst_height; y++) {
-        for (u32 x = 0; x < dst_width; x++) {
-            const f32 x_low = std::floor(static_cast<f32>(x) * dx_du);
-            const f32 y_low = std::floor(static_cast<f32>(y) * dy_dv);
-            const f32 x_high = std::ceil(static_cast<f32>(x) * dx_du);
-            const f32 y_high = std::ceil(static_cast<f32>(y) * dy_dv);
-            const f32 weight_x = (static_cast<f32>(x) * dx_du) - x_low;
-            const f32 weight_y = (static_cast<f32>(y) * dy_dv) - y_low;
-
-            const auto read_src = [&](f32 in_x, f32 in_y) {
-                const size_t read_from =
-                    ((static_cast<size_t>(in_x) * src_width + static_cast<size_t>(in_y)) >> 32) *
-                    ir_components;
-                return std::span<const f32>(&input[read_from], ir_components);
-            };
-
-            auto x0_y0 = read_src(x_low, y_low);
-            auto x1_y0 = read_src(x_high, y_low);
-            auto x0_y1 = read_src(x_low, y_high);
-            auto x1_y1 = read_src(x_high, y_high);
-
-            const auto result = bilinear_sample(x0_y0, x1_y0, x0_y1, x1_y1, weight_x, weight_y);
-
-            const size_t write_to = (y * dst_width + x) * ir_components;
-
-            std::memcpy(&output[write_to], &result, sizeof(f32) * ir_components);
-        }
-    }
-}
-
-template <bool unpack>
-void ProcessPitchLinear(std::span<const u8> input, std::span<u8> output, size_t extent_x,
-                        size_t extent_y, u32 pitch, u32 x0, u32 y0, size_t bpp) {
-    const size_t base_offset = x0 * bpp;
-    const size_t copy_size = extent_x * bpp;
-    for (size_t y = 0; y < extent_y; y++) {
-        const size_t first_offset = (y + y0) * pitch + base_offset;
-        const size_t second_offset = y * extent_x * bpp;
-        u8* write_to = unpack ? &output[first_offset] : &output[second_offset];
-        const u8* read_from = unpack ? &input[second_offset] : &input[first_offset];
-        std::memcpy(write_to, read_from, copy_size);
-    }
-}
-
-} // namespace
-
-struct SoftwareBlitEngine::BlitEngineImpl {
-    Common::ScratchBuffer<u8> tmp_buffer;
-    Common::ScratchBuffer<u8> src_buffer;
-    Common::ScratchBuffer<u8> dst_buffer;
-    Common::ScratchBuffer<f32> intermediate_src;
-    Common::ScratchBuffer<f32> intermediate_dst;
-    ConverterFactory converter_factory;
-};
-
-SoftwareBlitEngine::SoftwareBlitEngine(MemoryManager& memory_manager_)
-    : memory_manager{memory_manager_} {
-    impl = std::make_unique<BlitEngineImpl>();
-}
-
-SoftwareBlitEngine::~SoftwareBlitEngine() = default;
-
-bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
-                              Fermi2D::Config& config) {
-    const auto get_surface_size = [](Fermi2D::Surface& surface, u32 bytes_per_pixel) {
-        if (surface.linear == Fermi2D::MemoryLayout::BlockLinear) {
-            return CalculateSize(true, bytes_per_pixel, surface.width, surface.height,
-                                 surface.depth, surface.block_height, surface.block_depth);
-        }
-        return static_cast<size_t>(surface.pitch * surface.height);
-    };
-
-    const u32 src_extent_x = config.src_x1 - config.src_x0;
-    const u32 src_extent_y = config.src_y1 - config.src_y0;
-
-    const u32 dst_extent_x = config.dst_x1 - config.dst_x0;
-    const u32 dst_extent_y = config.dst_y1 - config.dst_y0;
-    const auto src_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
-    const auto dst_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(dst.format));
-    const size_t src_size = get_surface_size(src, src_bytes_per_pixel);
-
-    Tegra::Memory::GpuGuestMemory<u8, Tegra::Memory::GuestMemoryFlags::SafeRead> tmp_buffer(
-        memory_manager, src.Address(), src_size, &impl->tmp_buffer);
-
-    const size_t src_copy_size = src_extent_x * src_extent_y * src_bytes_per_pixel;
-    const size_t dst_copy_size = dst_extent_x * dst_extent_y * dst_bytes_per_pixel;
-
-    impl->src_buffer.resize_destructive(src_copy_size);
-
-    const bool no_passthrough =
-        src.format != dst.format || src_extent_x != dst_extent_x || src_extent_y != dst_extent_y;
-
-    const auto conversion_phase_same_format = [&]() {
-        NearestNeighbor(impl->src_buffer, impl->dst_buffer, src_extent_x, src_extent_y,
-                        dst_extent_x, dst_extent_y, dst_bytes_per_pixel);
-    };
-
-    const auto conversion_phase_ir = [&]() {
-        auto* input_converter = impl->converter_factory.GetFormatConverter(src.format);
-        impl->intermediate_src.resize_destructive((src_copy_size / src_bytes_per_pixel) *
-                                                  ir_components);
-        impl->intermediate_dst.resize_destructive((dst_copy_size / dst_bytes_per_pixel) *
-                                                  ir_components);
-        input_converter->ConvertTo(impl->src_buffer, impl->intermediate_src);
-
-        if (config.filter != Fermi2D::Filter::Bilinear) {
-            NearestNeighborFast(impl->intermediate_src, impl->intermediate_dst, src_extent_x,
-                                src_extent_y, dst_extent_x, dst_extent_y);
-        } else {
-            Bilinear(impl->intermediate_src, impl->intermediate_dst, src_extent_x, src_extent_y,
-                     dst_extent_x, dst_extent_y);
-        }
-
-        auto* output_converter = impl->converter_factory.GetFormatConverter(dst.format);
-        output_converter->ConvertFrom(impl->intermediate_dst, impl->dst_buffer);
-    };
-
-    // Do actual Blit
-
-    impl->dst_buffer.resize_destructive(dst_copy_size);
-    if (src.linear == Fermi2D::MemoryLayout::BlockLinear) {
-        UnswizzleSubrect(impl->src_buffer, tmp_buffer, src_bytes_per_pixel, src.width, src.height,
-                         src.depth, config.src_x0, config.src_y0, src_extent_x, src_extent_y,
-                         src.block_height, src.block_depth, src_extent_x * src_bytes_per_pixel);
-    } else {
-        ProcessPitchLinear<false>(tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y,
-                                  src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel);
-    }
-
-    // Conversion Phase
-    if (no_passthrough) {
-        if (src.format != dst.format || config.filter == Fermi2D::Filter::Bilinear) {
-            conversion_phase_ir();
-        } else {
-            conversion_phase_same_format();
-        }
-    } else {
-        impl->dst_buffer.swap(impl->src_buffer);
-    }
-
-    const size_t dst_size = get_surface_size(dst, dst_bytes_per_pixel);
-    Tegra::Memory::GpuGuestMemoryScoped<u8, Tegra::Memory::GuestMemoryFlags::SafeReadWrite>
-        tmp_buffer2(memory_manager, dst.Address(), dst_size, &impl->tmp_buffer);
-
-    if (dst.linear == Fermi2D::MemoryLayout::BlockLinear) {
-        SwizzleSubrect(tmp_buffer2, impl->dst_buffer, dst_bytes_per_pixel, dst.width, dst.height,
-                       dst.depth, config.dst_x0, config.dst_y0, dst_extent_x, dst_extent_y,
-                       dst.block_height, dst.block_depth, dst_extent_x * dst_bytes_per_pixel);
-    } else {
-        ProcessPitchLinear<true>(impl->dst_buffer, tmp_buffer2, dst_extent_x, dst_extent_y,
-                                 dst.pitch, config.dst_x0, config.dst_y0,
-                                 static_cast<size_t>(dst_bytes_per_pixel));
-    }
-    return true;
-}
-
-} // namespace Tegra::Engines::Blitter
--- a/src/video_core/engines/sw_blitter/blitter.h
+++ b/src/video_core/engines/sw_blitter/blitter.h
@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
-
-#pragma once
-
-#include "video_core/engines/fermi_2d.h"
-
-namespace Tegra {
-class MemoryManager;
-}
-
-namespace Tegra::Engines::Blitter {
-
-class SoftwareBlitEngine {
-public:
-    explicit SoftwareBlitEngine(MemoryManager& memory_manager_);
-    ~SoftwareBlitEngine();
-
-    bool Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, Fermi2D::Config& copy_config);
-
-private:
-    MemoryManager& memory_manager;
-    struct BlitEngineImpl;
-    std::unique_ptr<BlitEngineImpl> impl;
-};
-
-} // namespace Tegra::Engines::Blitter
--- a/src/video_core/engines/sw_blitter/converter.cpp
+++ b/src/video_core/engines/sw_blitter/converter.cpp
--- a/src/video_core/engines/sw_blitter/converter.h
+++ b/src/video_core/engines/sw_blitter/converter.h
@ -1,36 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
-
-#pragma once
-
-#include <memory>
-#include <span>
-
-#include "common/common_types.h"
-
-#include "video_core/gpu.h"
-
-namespace Tegra::Engines::Blitter {
-
-class Converter {
-public:
-    virtual void ConvertTo(std::span<const u8> input, std::span<f32> output) = 0;
-    virtual void ConvertFrom(std::span<const f32> input, std::span<u8> output) = 0;
-    virtual ~Converter() = default;
-};
-
-class ConverterFactory {
-public:
-    ConverterFactory();
-    ~ConverterFactory();
-
-    Converter* GetFormatConverter(RenderTargetFormat format);
-
-private:
-    Converter* BuildConverter(RenderTargetFormat format);
-
-    struct ConverterFactoryImpl;
-    std::unique_ptr<ConverterFactoryImpl> impl;
-};
-
-} // namespace Tegra::Engines::Blitter