Video decoder for data_pipeline #200

Open

wants to merge 44 commits into base: main

Commits (44)
All 44 commits are by am831:

3e2183c  set up video decoder (Nov 2, 2023)
4c8db4b  header file (Nov 2, 2023)
bee4972  Merge branch 'main' of https://github.com/facebookresearch/fairseq2 i… (Nov 4, 2023)
10986bb  libavcodec (Nov 4, 2023)
08943ea  video decoder progress with libavcodec/libavformat (Nov 6, 2023)
d493e00  Merge branch 'facebookresearch:main' into vid_decoding (Nov 6, 2023)
f9902cf  fix library linking (Nov 7, 2023)
37853e9  video decoder debugging (Nov 8, 2023)
f2a9dcf  fix seg fault (Nov 9, 2023)
4a9645b  video decoder progress (Nov 10, 2023)
72e650b  add open_streams, deocde_frame, utils.h (Nov 11, 2023)
4fd41de  decoder progress (Nov 12, 2023)
03c4681  Merge branch 'facebookresearch:main' into vid_decoding (Nov 13, 2023)
9cb21fb  decoder progress (Nov 13, 2023)
bb2bafb  decoder progress (Nov 14, 2023)
9f952ac  combine decoder functions. write data into tensor (Nov 15, 2023)
5e61ae7  convert video frames to rgb (Nov 19, 2023)
a5f9ce9  decode video frames (Nov 27, 2023)
43b8049  improve design (Nov 28, 2023)
687eb2d  Merge branch 'facebookresearch:main' into vid_decoding (Nov 28, 2023)
1cb04e2  file names (Nov 28, 2023)
03bc600  Merge branch 'facebookresearch:main' into vid_decoding (Nov 28, 2023)
530edf9  linker error (Nov 29, 2023)
05f03aa  linker error (Nov 29, 2023)
da11a5e  everything works (Dec 1, 2023)
8ad19e6  clean up (Dec 2, 2023)
e2b1edd  Merge branch 'facebookresearch:main' into vid_decoding (Dec 2, 2023)
1aefe54  clean up (Dec 2, 2023)
96c5f6f  remove unused library (Dec 2, 2023)
dddc3a1  fix dtype (Dec 2, 2023)
914d1d5  class for libswscale resources (Dec 3, 2023)
6e42f36  clang tidy (Dec 3, 2023)
e790178  reformat (Dec 3, 2023)
c1fbd06  unit test (Dec 3, 2023)
483c7d1  unit test (Dec 3, 2023)
d1f6d17  probe format (Dec 3, 2023)
e06f7f4  transform class (Dec 4, 2023)
b20694e  more options in video_decoder_options (Dec 5, 2023)
ebada55  more options for video_decoder_options (Dec 5, 2023)
13111ab  clean up (Dec 6, 2023)
33cae96  clean up (Dec 7, 2023)
155f3b1  Merge branch 'main' of https://github.com/facebookresearch/fairseq2 i… (Dec 8, 2023)
e07fd1f  fix cmake (Dec 8, 2023)
94570e9  clang tidy (Dec 8, 2023)
1 change: 1 addition & 0 deletions fairseq2n/python/src/fairseq2n/bindings/CMakeLists.txt
@@ -26,6 +26,7 @@ target_sources(py_bindings
data/text/init.cc
data/text/sentencepiece.cc
data/text/text_reader.cc
data/video.cc
type_casters/data.cc
type_casters/map_fn.cc
type_casters/string.cc
2 changes: 2 additions & 0 deletions fairseq2n/python/src/fairseq2n/bindings/data/init.cc
@@ -40,6 +40,8 @@ def_data(py::module_ &base)

def_audio(m);

def_video(m);

def_image(m);

def_data_pipeline(m);
61 changes: 61 additions & 0 deletions fairseq2n/python/src/fairseq2n/bindings/data/video.cc
@@ -0,0 +1,61 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "fairseq2n/bindings/module.h"

#include <cstdint>
#include <memory>

#include <ATen/Device.h>
#include <ATen/ScalarType.h>

#include <fairseq2n/float.h>
#include <fairseq2n/data/video/video_decoder.h>

namespace py = pybind11;

namespace fairseq2n {

void
def_video(py::module_ &data_module)
{
py::module_ m = data_module.def_submodule("video");

// VideoDecoder
py::class_<video_decoder, std::shared_ptr<video_decoder>>(m, "VideoDecoder")
.def(
py::init([](
std::optional<at::ScalarType> maybe_dtype,
std::optional<at::Device> maybe_device,
bool pin_memory,
bool get_pts_only,
bool get_frames_only,
int width = 0,
int height = 0)
{
auto opts = video_decoder_options()
.maybe_dtype(maybe_dtype)
.maybe_device(maybe_device)
.pin_memory(pin_memory)
.get_pts_only(get_pts_only)
.get_frames_only(get_frames_only)
.width(width)
.height(height);

return std::make_shared<video_decoder>(opts);
}),
py::arg("dtype") = std::nullopt,
py::arg("device") = std::nullopt,
py::arg("pin_memory") = false,
py::arg("get_pts_only") = false,
py::arg("get_frames_only") = false,
py::arg("width") = 0,
py::arg("height") = 0)
.def("__call__", &video_decoder::operator(), py::call_guard<py::gil_scoped_release>{});

map_functors().register_<video_decoder>();
}
} // namespace fairseq2n
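
The binding above maps the constructor keyword arguments one-to-one onto video_decoder_options and releases the GIL around __call__. A minimal usage sketch from Python follows; the import paths (mirroring how AudioDecoder is re-exported) are assumptions and not part of this diff.

# Hypothetical usage sketch; import paths mirror the existing AudioDecoder
# wiring and are assumptions, not part of this PR.
from fairseq2.data.video import VideoDecoder  # assumed Python-side re-export
from fairseq2.memory import MemoryBlock

# Keyword arguments map onto video_decoder_options in the binding above.
decoder = VideoDecoder(get_frames_only=True)

with open("clip.mp4", "rb") as fp:
    block = MemoryBlock(fp.read())

# __call__ releases the GIL while the C++ decoder runs.
output = decoder(block)

# open_container() builds one entry per stream, keyed by stream index.
for stream_index, stream_data in output.items():
    print(stream_index, type(stream_data))
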
3 changes: 3 additions & 0 deletions fairseq2n/python/src/fairseq2n/bindings/module.h
@@ -49,4 +49,7 @@ def_text_converters(pybind11::module_ &text_module);
void
def_text_reader(pybind11::module_ &text_module);

void
def_video(pybind11::module_ &data_module);

} // namespace fairseq2n
25 changes: 25 additions & 0 deletions fairseq2n/src/fairseq2n/CMakeLists.txt
@@ -68,6 +68,10 @@ target_sources(fairseq2n
data/text/sentencepiece/sp_encoder.cc
data/text/sentencepiece/sp_model.cc
data/text/sentencepiece/sp_processor.cc
data/video/video_decoder.cc
data/video/detail/ffmpeg.cc
data/video/detail/stream.cc
data/video/detail/transform.cc
)

if(FAIRSEQ2N_SUPPORT_IMAGE)
@@ -115,12 +119,33 @@ target_include_directories(fairseq2n ${system}
$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/src>
)

find_path(AVCODEC_INCLUDE_DIR libavcodec/avcodec.h)
find_library(AVCODEC_LIBRARY avcodec)
find_path(AVFORMAT_INCLUDE_DIR libavformat/avformat.h)
find_library(AVFORMAT_LIBRARY avformat)
find_path(AVUTIL_INCLUDE_DIR libavutil/avutil.h)
find_library(AVUTIL_LIBRARY avutil)
find_path(SWSCALE_INCLUDE_DIR libswscale/swscale.h)
find_library(SWSCALE_LIBRARY swscale)

target_include_directories(fairseq2n
PRIVATE
${AVCODEC_INCLUDE_DIR}
${AVFORMAT_INCLUDE_DIR}
${AVUTIL_INCLUDE_DIR}
${SWSCALE_INCLUDE_DIR}
)

find_package(PNG REQUIRED)
find_package(JPEG REQUIRED)

target_link_libraries(fairseq2n
PRIVATE
${CMAKE_DL_LIBS}
${AVCODEC_LIBRARY}
${AVFORMAT_LIBRARY}
${AVUTIL_LIBRARY}
${SWSCALE_LIBRARY}
PRIVATE
fmt::fmt
Iconv::Iconv
211 changes: 211 additions & 0 deletions fairseq2n/src/fairseq2n/data/video/detail/ffmpeg.cc
@@ -0,0 +1,211 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "fairseq2n/data/video/detail/ffmpeg.h"

#include <cstdint>
#include <exception>
#include <stdexcept>

#include <ATen/Functions.h>
#include <ATen/Tensor.h>

#include "fairseq2n/exception.h"
#include "fairseq2n/float.h"
#include "fairseq2n/fmt.h"
#include "fairseq2n/memory.h"
#include "fairseq2n/data/detail/tensor_helpers.h"
#include "fairseq2n/detail/exception.h"

using namespace std;

namespace fairseq2n::detail {

ffmpeg_decoder::ffmpeg_decoder(video_decoder_options opts)
: opts_{opts}
{}

data_dict
ffmpeg_decoder::open_container(const memory_block &block)
{
// Opens the media container and iterates over the streams.

auto data_ptr = reinterpret_cast<const uint8_t*>(block.data());
size_t data_size = block.size();
fairseq2n::detail::buffer_data bd = {data_ptr, data_size, this};
int ret = 0;

fmt_ctx_ = avformat_alloc_context();
if (fmt_ctx_ == nullptr) {
throw_<runtime_error>("Failed to allocate AVFormatContext.");
}
// Allocate buffer for input/output operations via AVIOContext
avio_ctx_buffer_ = static_cast<uint8_t*>(av_malloc(data_size + AV_INPUT_BUFFER_PADDING_SIZE));
if (avio_ctx_buffer_ == nullptr) {
throw_<runtime_error>("Failed to allocate AVIOContext buffer.");
}
// Create an AVIOContext for using custom IO
avio_ctx_ = avio_alloc_context(
avio_ctx_buffer_,
data_size,
0, // Write flag
&bd, // Pointer to user data
&read_callback, // Read function
nullptr, // Write function, not used
nullptr // Seek function, not used
);
if (avio_ctx_ == nullptr) {
throw_<runtime_error>("Failed to allocate AVIOContext.");
}
if (!read_callback_error_message().empty()) {
throw_<runtime_error>("Size is too large to fit in an int");
}

fmt_ctx_->pb = avio_ctx_;
fmt_ctx_->flags |= AVFMT_FLAG_CUSTOM_IO;
fmt_ctx_->flags |= AVFMT_FLAG_NONBLOCK;

// Determine the input format
fmt_ctx_->iformat = nullptr;
if (data_size <= std::numeric_limits<int>::max()) {
AVProbeData probe_data = {nullptr, avio_ctx_buffer_, static_cast<int>(data_size), nullptr};
fmt_ctx_->iformat = av_probe_input_format(&probe_data, 1);
}

// Open media and read the header
ret = avformat_open_input(&fmt_ctx_, nullptr, fmt_ctx_->iformat, nullptr);
if (ret < 0) {
throw_with_nested<invalid_argument>("Failed to open input.");
}

// Read data from the media
ret = avformat_find_stream_info(fmt_ctx_, nullptr);
if (ret < 0) {
throw_<runtime_error>("Failed to find stream information.");
}

// Iterate over all streams
flat_hash_map<std::string, data> all_streams;
for (int i = 0; i < static_cast<int>(fmt_ctx_->nb_streams); i++) {
all_streams[std::to_string(i)] = open_stream(i);
}

return all_streams;
}

data_dict
ffmpeg_decoder::open_stream(int stream_index)
{
// Opens a stream and decodes the video frames. Skips all streams that are not video for now.

av_stream_ = std::make_unique<stream>(stream_index, *fmt_ctx_);
int processed_frames = 0;
if (av_stream_->type_ == AVMEDIA_TYPE_VIDEO) {
av_stream_->alloc_resources();

// Fill codec context with codec parameters
int ret = avcodec_parameters_to_context(av_stream_->codec_ctx_, av_stream_->codec_params_);
if (ret < 0) {
throw_<runtime_error>("Failed to copy decoder parameters to input decoder context for stream {}\n",
stream_index);
}

// Open the codec
ret = avcodec_open2(av_stream_->codec_ctx_, av_stream_->codec_, nullptr);
if (ret < 0) {
throw_<runtime_error>("Failed to open decoder for stream {}\n", stream_index);
}

// Create tensor storage for the stream
av_stream_->init_tensor_storage(opts_);
// Iterate over all frames in the stream and decode them
while (av_read_frame(fmt_ctx_, av_stream_->pkt_) >= 0) {
if (av_stream_->pkt_->stream_index == stream_index) {
// Send raw data packet (compressed frame) to the decoder through the codec context
ret = avcodec_send_packet(av_stream_->codec_ctx_, av_stream_->pkt_);
if (ret < 0) {
throw_<runtime_error>("Error sending packet to decoder for stream {}\n",
stream_index);
}
// Receive raw data frame (uncompressed frame) from the decoder through the codec context
while (ret >= 0) {
ret = avcodec_receive_frame(av_stream_->codec_ctx_, av_stream_->frame_);
if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
break;
// EAGAIN is not an error, it means we need more input
// AVERROR_EOF means decoding finished
} else if (ret < 0) {
throw_<runtime_error>("Error receiving frame from decoder for stream {}\n",
stream_index);
}
// Transform the frame to RGB to guarantee 3 color channels
sws_ = std::make_unique<transform>(av_stream_->frame_->width, av_stream_->frame_->height,
static_cast<AVPixelFormat>(av_stream_->frame_->format), opts_);
sws_->transform_to_rgb(*av_stream_->sw_frame_, *av_stream_->frame_, stream_index, opts_);
// Store PTS in microseconds
if (!opts_.get_frames_only()) {
av_stream_->tensor_storage_.frame_pts[processed_frames] = av_stream_->frame_->pts * av_stream_->metadata_.time_base * 1000000;
}
// Store raw frame data for one frame
if (!opts_.get_pts_only()) {
at::Tensor one_frame = av_stream_->tensor_storage_.all_video_frames[processed_frames];
writable_memory_span frame_bits = get_raw_mutable_storage(one_frame);
auto frame_data = reinterpret_cast<uint8_t*>(frame_bits.data());
// Get total size of the frame in bytes
int frame_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, av_stream_->sw_frame_->width,
av_stream_->sw_frame_->height, 1);
// Copy the entire frame at once
memcpy(frame_data, av_stream_->sw_frame_->data[0], frame_size);
}
processed_frames++;

av_frame_unref(av_stream_->frame_); // Unref old data so the frame can be reused
av_frame_unref(av_stream_->sw_frame_);
}
}
av_packet_unref(av_stream_->pkt_);
}
av_stream_->init_data_storage(opts_);

return av_stream_->stream_data_;
} else {
// Skip streams if not video for now
return data_dict{};
}
}

int
ffmpeg_decoder::read_callback(void *opaque, uint8_t *buf, int buf_size)
{
// C style function used by ffmpeg to read from memory buffer
// Read up to buf_size bytes from the resource accessed by the AVIOContext object
auto *bd = static_cast<fairseq2n::detail::buffer_data *>(opaque);
size_t temp_size = std::min(static_cast<size_t>(buf_size), bd->size);
if (temp_size > std::numeric_limits<int>::max()) {
bd->decoder->error_message_ = "Size is too large to fit in an int";
return AVERROR(EINVAL);
}
buf_size = static_cast<int>(temp_size);
if (buf_size <= 0)
return AVERROR_EOF;
memcpy(buf, bd->ptr, static_cast<size_t>(buf_size));
bd->ptr += buf_size;
bd->size -= static_cast<size_t>(buf_size);
return buf_size;
}

ffmpeg_decoder::~ffmpeg_decoder()
{
if (avio_ctx_ != nullptr) {
av_freep(&avio_ctx_->buffer);
av_freep(&avio_ctx_);
}
if (fmt_ctx_ != nullptr) {
avformat_free_context(fmt_ctx_);
}
}

} // namespace fairseq2n
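
Because map_functors().register_<video_decoder>() (see video.cc above) registers the decoder as a data-pipeline map functor, the intended use is presumably inside a fairseq2 data pipeline, the same way AudioDecoder is used today. A hedged sketch follows; the FileMapper output keys and the existence of a Python-side VideoDecoder wrapper are assumptions, not shown in this diff.

# Hedged sketch: decoding videos inside a data pipeline, mirroring the usual
# AudioDecoder wiring. The wrapper import and the key names are assumptions.
from fairseq2.data import FileMapper, read_sequence
from fairseq2.data.video import VideoDecoder  # assumed Python-side wrapper

video_files = ["clips/clip0.mp4", "clips/clip1.mp4"]

pipeline = (
    read_sequence(video_files)
    .map(FileMapper())                      # -> {"path": ..., "data": MemoryBlock}
    .map(VideoDecoder(), selector="data")   # decode the mapped bytes in place
    .and_return()
)

for example in pipeline:
    streams = example["data"]  # per-stream dict, as built in open_container()
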