Compare commits
No commits in common. "master" and "b2350" have entirely different histories.
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=13.0.0
|
||||
ARG CUDA_VERSION=12.3.1
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
|
|
@ -13,38 +13,28 @@ WORKDIR /app
|
|||
ARG CUDA_DOCKER_ARCH=all
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable cuBLAS
|
||||
ENV GGML_CUDA=1
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential libsdl2-dev wget cmake git \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
# Ref: https://stackoverflow.com/a/53464012
|
||||
ENV CUDA_MAIN_VERSION=13.0
|
||||
ENV CUDA_MAIN_VERSION=12.3
|
||||
ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
|
||||
|
||||
COPY .. .
|
||||
# Enable cuBLAS
|
||||
RUN make base.en CMAKE_ARGS="-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES='75;80;86;90'"
|
||||
|
||||
RUN find /app/build -name "*.o" -delete && \
|
||||
find /app/build -name "*.a" -delete && \
|
||||
rm -rf /app/build/CMakeFiles && \
|
||||
rm -rf /app/build/cmake_install.cmake && \
|
||||
rm -rf /app/build/_deps
|
||||
RUN make base.en
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
ENV CUDA_MAIN_VERSION=13.0
|
||||
ENV CUDA_MAIN_VERSION=12.3
|
||||
ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl ffmpeg wget cmake git \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
RUN du -sh /app/*
|
||||
RUN find /app -type f -size +100M
|
||||
ENV PATH=/app/build/bin:$PATH
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
||||
|
|
|
|||
|
|
@ -1,28 +0,0 @@
|
|||
ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential libsdl2-dev wget cmake git \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY .. .
|
||||
# Enable SYCL
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
echo "GGML_SYCL_F16 is set" \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
make base.en CMAKE_ARGS="-DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16}"
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl ffmpeg libsdl2-dev wget cmake git \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
ENV PATH=/app/build/bin:$PATH
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG MUSA_VERSION=rc4.2.0
|
||||
# Target the MUSA build image
|
||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
|
||||
# Target the MUSA runtime image
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential libsdl2-dev wget cmake git && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* /tmp/* /var/tmp/*
|
||||
|
||||
COPY .. .
|
||||
# Enable muBLAS
|
||||
RUN make base.en CMAKE_ARGS="-DGGML_MUSA=1"
|
||||
|
||||
RUN find /app/build -name "*.o" -delete && \
|
||||
find /app/build -name "*.a" -delete && \
|
||||
rm -rf /app/build/CMakeFiles && \
|
||||
rm -rf /app/build/cmake_install.cmake && \
|
||||
rm -rf /app/build/_deps
|
||||
|
||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl ffmpeg wget cmake git && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* /tmp/* /var/tmp/*
|
||||
|
||||
COPY --from=build /app/build/bin /app/build/bin
|
||||
COPY --from=build /app/samples /app/samples
|
||||
COPY --from=build /app/models /app/models
|
||||
|
||||
ENV PATH=/app/build/bin:$PATH
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
FROM ubuntu:24.04 AS build
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential wget cmake git libvulkan-dev glslc \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY .. .
|
||||
RUN make base.en CMAKE_ARGS="-DGGML_VULKAN=1"
|
||||
|
||||
FROM ubuntu:24.04 AS runtime
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl ffmpeg libsdl2-dev wget cmake git libvulkan1 mesa-vulkan-drivers \
|
||||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
ENV PATH=/app/build/bin:$PATH
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
||||
|
|
@ -16,5 +16,4 @@ RUN apt-get update && \
|
|||
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
||||
|
||||
COPY --from=build /app /app
|
||||
ENV PATH=/app/build/bin:$PATH
|
||||
ENTRYPOINT [ "bash", "-c" ]
|
||||
|
|
|
|||
|
|
@ -1,3 +0,0 @@
|
|||
build*/
|
||||
.github/
|
||||
.devops/
|
||||
|
|
@ -13,10 +13,10 @@ jobs:
|
|||
ubuntu-22:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/setup-go@v6
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '^1.23'
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v4
|
||||
- run: |
|
||||
cd bindings/go
|
||||
make test
|
||||
|
|
|
|||
|
|
@ -1,11 +1,55 @@
|
|||
name: Bindings Tests (Ruby)
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- bindings/ruby/**
|
||||
- src/**/*.c
|
||||
- src/**/*.cpp
|
||||
- src/**/*.h
|
||||
- src/**/*.m
|
||||
- src/**/*.metal
|
||||
- include/**/*.c
|
||||
- include/**/*.cpp
|
||||
- include/**/*.h
|
||||
- include/**/*.m
|
||||
- include/**/*.metal
|
||||
- ggml/**/*.c
|
||||
- ggml/**/*.cpp
|
||||
- ggml/**/*.h
|
||||
- ggml/**/*.m
|
||||
- ggml/**/*.metal
|
||||
- scripts/get-flags.mk
|
||||
- examples/common.h
|
||||
- examples/common.cpp
|
||||
- examples/common-whisper.h
|
||||
- examples/common-whisper.cpp
|
||||
- examples/stb_vorbis.c
|
||||
- examples/miniaudio.h
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths:
|
||||
- bindings/ruby/**
|
||||
- src/**/*.c
|
||||
- src/**/*.cpp
|
||||
- src/**/*.h
|
||||
- src/**/*.m
|
||||
- src/**/*.metal
|
||||
- include/**/*.c
|
||||
- include/**/*.cpp
|
||||
- include/**/*.h
|
||||
- include/**/*.m
|
||||
- include/**/*.metal
|
||||
- ggml/**/*.c
|
||||
- ggml/**/*.cpp
|
||||
- ggml/**/*.h
|
||||
- ggml/**/*.m
|
||||
- ggml/**/*.metal
|
||||
- scripts/get-flags.mk
|
||||
- examples/common.h
|
||||
- examples/common.cpp
|
||||
- examples/common-whisper.h
|
||||
- examples/common-whisper.cpp
|
||||
- examples/stb_vorbis.c
|
||||
- examples/miniaudio.h
|
||||
|
||||
jobs:
|
||||
ubuntu-22:
|
||||
|
|
@ -16,6 +60,6 @@ jobs:
|
|||
steps:
|
||||
- uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: '3.2'
|
||||
- uses: actions/checkout@v6
|
||||
ruby-version: '3.1'
|
||||
- uses: actions/checkout@v4
|
||||
- run: rake test
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -15,18 +15,16 @@ jobs:
|
|||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64" }
|
||||
- { tag: "main-musa", dockerfile: ".devops/main-musa.Dockerfile", platform: "linux/amd64" }
|
||||
- { tag: "main-intel", dockerfile: ".devops/main-intel.Dockerfile", platform: "linux/amd64" }
|
||||
- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
|
||||
- { tag: "main-vulkan", dockerfile: ".devops/main-vulkan.Dockerfile", platform: "linux/amd64" }
|
||||
#TODO: the cuda image keeps failing - disable for now
|
||||
# https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
|
||||
#- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
|
@ -43,35 +41,21 @@ jobs:
|
|||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Free up disk space
|
||||
run: |
|
||||
sudo apt-get remove -y '^dotnet-.*' '^llvm-.*' '^mysql-.*' '^postgresql-.*'
|
||||
sudo apt-get autoremove -y
|
||||
sudo apt-get autoclean
|
||||
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /opt/ghc
|
||||
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
||||
|
||||
docker system prune -af
|
||||
|
||||
df -h
|
||||
|
||||
- name: Generate tags
|
||||
id: tags
|
||||
run: |
|
||||
TAGS="ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
|
||||
if [ "${{ github.event_name }}" == "push" ]; then
|
||||
TAGS="$TAGS,ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
fi
|
||||
echo "tags=$TAGS" >> $GITHUB_OUTPUT
|
||||
- name: Build and push Docker image (versioned)
|
||||
if: github.event_name == 'push'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platform }}
|
||||
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
- name: Build and push Docker image (tagged)
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
platforms: ${{ matrix.config.platform }}
|
||||
tags: ${{ steps.tags.outputs.tags }}
|
||||
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
|
|
|||
|
|
@ -1,97 +0,0 @@
|
|||
name: Examples WASM
|
||||
on:
|
||||
push:
|
||||
branches: ["master"]
|
||||
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
|
||||
concurrency:
|
||||
group: "pages"
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
deploy-wasm-github-pages:
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup Pages
|
||||
uses: actions/configure-pages@v5
|
||||
|
||||
- name: Setup emsdk
|
||||
uses: mymindstorm/setup-emsdk@v14
|
||||
|
||||
- name: Build WASM Examples
|
||||
# Enable for real build later in whisper.cpp
|
||||
run: |
|
||||
mkdir -p build-em && cd build-em
|
||||
emcmake cmake .. -DCMAKE_BUILD_TYPE=Release
|
||||
make -j
|
||||
|
||||
- name: Create staging directory
|
||||
run: mkdir -p staging
|
||||
|
||||
- name: Create .nojekyll file in staging directory
|
||||
run: touch staging/.nojekyll
|
||||
|
||||
- name: Copy application files
|
||||
run: |
|
||||
build_dir=build-em/bin
|
||||
|
||||
ls ${build_dir}
|
||||
|
||||
# command.wasm
|
||||
target_dir=staging/command.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/command.wasm/{index.html,command.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libcommand.js ${target_dir}
|
||||
|
||||
# bench.wasm
|
||||
target_dir=staging/bench.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/bench.wasm/{index.html,bench.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libbench.js ${target_dir}
|
||||
|
||||
# stream.wasm
|
||||
target_dir=staging/stream.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/stream.wasm/{index.html,stream.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libstream.js ${target_dir}
|
||||
|
||||
# wchess.wasm
|
||||
target_dir=staging/wchess.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp -r ${build_dir}/wchess.wasm/{index.html,css,img,js} ${target_dir}
|
||||
cp ${build_dir}/wchess.wasm.js ${target_dir}
|
||||
|
||||
# whisper.wasm (this will be the main example page)
|
||||
target_dir=staging
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/whisper.wasm/{index.html,main.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libmain.js ${target_dir}
|
||||
|
||||
# Copy Cross-Origin Isolation service worker
|
||||
cp -v examples/coi-serviceworker.js staging/
|
||||
|
||||
- name: List files in staging directory (for debugging)
|
||||
run: |
|
||||
echo "Files in staging directory:"
|
||||
find staging -type f | sort
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-pages-artifact@v4
|
||||
with:
|
||||
path: ./staging
|
||||
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deployment
|
||||
uses: actions/deploy-pages@v4
|
||||
|
|
@ -17,7 +17,7 @@ jobs:
|
|||
node-version: [ 16.x, 18.x ]
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
|
|
@ -27,7 +27,7 @@ jobs:
|
|||
sudo apt-get install libsdl2-dev
|
||||
|
||||
- name: Use Node.js ${{ matrix.node-version }}
|
||||
uses: actions/setup-node@v6
|
||||
uses: actions/setup-node@v1
|
||||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
cache: 'npm'
|
||||
|
|
|
|||
|
|
@ -14,8 +14,6 @@
|
|||
|
||||
build/
|
||||
build-*/
|
||||
build_*/
|
||||
tmp/
|
||||
|
||||
# SPM
|
||||
.build/
|
||||
|
|
@ -51,8 +49,6 @@ extra/bench-gg.txt
|
|||
models/*.mlmodel
|
||||
models/*.mlmodelc
|
||||
models/*.mlpackage
|
||||
models/*-encoder-openvino.xml
|
||||
models/*-encoder-openvino-cache/
|
||||
bindings/java/.gradle/
|
||||
bindings/java/.idea/
|
||||
.idea/
|
||||
|
|
@ -63,4 +59,4 @@ cmake-build-debug/
|
|||
.gradle/
|
||||
local.properties
|
||||
.log
|
||||
.exe
|
||||
.exe
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
|
||||
project("whisper.cpp" C CXX)
|
||||
project("whisper.cpp" VERSION 1.8.4)
|
||||
project("whisper.cpp" VERSION 1.7.4)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
set(SOVERSION 1)
|
||||
|
|
@ -34,9 +34,6 @@ endif()
|
|||
if (EMSCRIPTEN)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
option(WHISPER_WASM_SINGLE_FILE "whisper: embed WASM inside the generated whisper.js" ON)
|
||||
|
||||
# TODO: without these, we get the following error:
|
||||
|
|
@ -62,6 +59,9 @@ option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
|||
# option list
|
||||
#
|
||||
|
||||
# general
|
||||
option(WHISPER_CCACHE "whisper: use ccache if available" ON)
|
||||
|
||||
# debug
|
||||
option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
|
||||
option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
|
||||
|
|
@ -96,6 +96,7 @@ option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
|
|||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
||||
# override ggml options
|
||||
set(GGML_CCACHE ${WHISPER_CCACHE})
|
||||
set(GGML_SANITIZE_THREAD ${WHISPER_SANITIZE_THREAD})
|
||||
set(GGML_SANITIZE_ADDRESS ${WHISPER_SANITIZE_ADDRESS})
|
||||
set(GGML_SANITIZE_UNDEFINED ${WHISPER_SANITIZE_UNDEFINED})
|
||||
|
|
@ -120,12 +121,6 @@ whisper_option_depr(WARNING WHISPER_OPENMP GGML_OPENMP)
|
|||
whisper_option_depr(WARNING WHISPER_RPC GGML_RPC)
|
||||
whisper_option_depr(WARNING WHISPER_SYCL GGML_SYCL)
|
||||
whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
|
||||
whisper_option_depr(WARNING WHISPER_CCACHE GGML_CCACHE)
|
||||
|
||||
if (GGML_CUDA AND NOT MSVC)
|
||||
#GGML_CUDA enabled, add the necessary compile options -Wno-deprecated-gpu-targets
|
||||
add_compile_options(-Wno-deprecated-gpu-targets)
|
||||
endif()
|
||||
|
||||
#
|
||||
# build the library
|
||||
|
|
@ -140,22 +135,6 @@ if (NOT TARGET ggml)
|
|||
add_library(ggml ALIAS ggml::ggml)
|
||||
else()
|
||||
add_subdirectory(ggml)
|
||||
if(WIN32)
|
||||
# The following adds a _DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR macro and is a workaround for
|
||||
# the Windows C++ standard library which does not support constexpr mutexes.
|
||||
# From the release notes: https://github.com/microsoft/STL/wiki/Changelog
|
||||
# Disable constexpr mutex constructor on Windows
|
||||
# Fixed mutex's constructor to be constexpr. #3824 #4000 #4339
|
||||
# Note: Programs that aren't following the documented restrictions on binary compatibility may encounter
|
||||
# null dereferences in mutex machinery. You must follow this rule:
|
||||
# When you mix binaries built by different supported versions of the toolset, the Redistributable version
|
||||
# must be at least as new as the latest toolset used by any app component.
|
||||
# You can define _DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR as an escape hatch.
|
||||
#
|
||||
# Specifically to whisper.cpp this would cause a crash when using the Java bindings.
|
||||
# resulting in an Invalid memory access error.
|
||||
target_compile_definitions(ggml-base PRIVATE _DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
|
||||
endif()
|
||||
endif()
|
||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||
endif()
|
||||
|
|
@ -181,10 +160,6 @@ get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
|
|||
set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
|
||||
install(TARGETS whisper LIBRARY PUBLIC_HEADER)
|
||||
|
||||
target_compile_definitions(whisper PRIVATE
|
||||
WHISPER_VERSION="${PROJECT_VERSION}"
|
||||
)
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
|
||||
|
|
@ -215,44 +190,10 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
|
|||
#
|
||||
|
||||
if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
include(CTest)
|
||||
add_subdirectory(tests)
|
||||
#include(CTest)
|
||||
#add_subdirectory(tests)
|
||||
endif ()
|
||||
|
||||
if (WHISPER_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
endif()
|
||||
|
||||
if (MSVC)
|
||||
set(MSVC_WARNING_FLAGS
|
||||
/wd4101 # Unreferenced local variable
|
||||
/wd4005 # Macro redefinition
|
||||
/wd4065 # switch statement contains 'default' but no 'case' labels
|
||||
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
|
||||
/wd4244 # Conversion from one type to another type, possible loss of data
|
||||
/wd4805 # Unsafe mix of type
|
||||
/wd4305 # Truncation from 'type1' to 'type2' (often double to float)
|
||||
/wd4996 # Function or variable may be unsafe/deprecated
|
||||
)
|
||||
function(disable_msvc_warnings target_name)
|
||||
if(TARGET ${target_name})
|
||||
target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
if (WHISPER_BUILD_EXAMPLES)
|
||||
disable_msvc_warnings(whisper)
|
||||
disable_msvc_warnings(common)
|
||||
disable_msvc_warnings(common-sdl)
|
||||
disable_msvc_warnings(lsp)
|
||||
disable_msvc_warnings(wchess-core)
|
||||
disable_msvc_warnings(whisper-command)
|
||||
disable_msvc_warnings(whisper-cli)
|
||||
disable_msvc_warnings(whisper-server)
|
||||
disable_msvc_warnings(whisper-stream)
|
||||
disable_msvc_warnings(whisper-talk-llama)
|
||||
disable_msvc_warnings(whisper-bench)
|
||||
disable_msvc_warnings(quantize)
|
||||
disable_msvc_warnings(vad-speech-segments)
|
||||
endif()
|
||||
endif()
|
||||
|
|
|
|||
2
LICENSE
2
LICENSE
|
|
@ -1,6 +1,6 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2023-2026 The ggml authors
|
||||
Copyright (c) 2023-2024 The ggml authors
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
|
|
|||
8
Makefile
8
Makefile
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
.PHONY: build
|
||||
build:
|
||||
cmake -B build $(CMAKE_ARGS)
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
|
||||
# download a few audio samples into folder "./samples":
|
||||
|
|
@ -41,17 +41,17 @@ samples:
|
|||
|
||||
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3 large-v3-turbo:
|
||||
bash ./models/download-ggml-model.sh $@
|
||||
cmake -B build $(CMAKE_ARGS)
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
@echo ""
|
||||
@echo "==============================================="
|
||||
@echo "Running $@ on all samples in ./samples ..."
|
||||
@echo "==============================================="
|
||||
@echo ""
|
||||
@for f in samples/*.{flac,mp3,ogg,wav}; do \
|
||||
@for f in samples/*$(.flac .mp3 .ogg .wav); do \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "" ; \
|
||||
./build/bin/whisper-cli -m models/ggml-$@.bin -f $$f ; \
|
||||
echo "" ; \
|
||||
|
|
|
|||
270
README.md
270
README.md
|
|
@ -2,12 +2,15 @@
|
|||
|
||||

|
||||
|
||||
[](https://github.com/ggml-org/whisper.cpp/actions)
|
||||
[](https://github.com/ggerganov/whisper.cpp/actions)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://conan.io/center/whisper-cpp)
|
||||
[](https://www.npmjs.com/package/whisper.cpp/)
|
||||
|
||||
Stable: [v1.8.1](https://github.com/ggml-org/whisper.cpp/releases/tag/v1.8.1) / [Roadmap](https://github.com/orgs/ggml-org/projects/4/)
|
||||
> [!NOTE]
|
||||
> New maintenance roadmap: https://github.com/ggerganov/whisper.cpp/discussions/2788
|
||||
|
||||
Stable: [v1.7.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
|
||||
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
|
||||
|
||||
|
|
@ -23,9 +26,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
|
|||
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
|
||||
- [OpenVINO Support](#openvino-support)
|
||||
- [Ascend NPU Support](#ascend-npu-support)
|
||||
- [Moore Threads GPU Support](#moore-threads-gpu-support)
|
||||
- [C-style API](https://github.com/ggml-org/whisper.cpp/blob/master/include/whisper.h)
|
||||
- [Voice Activity Detection (VAD)](#voice-activity-detection-vad)
|
||||
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
|
||||
|
||||
Supported platforms:
|
||||
|
||||
|
|
@ -33,14 +34,14 @@ Supported platforms:
|
|||
- [x] [iOS](examples/whisper.objc)
|
||||
- [x] [Android](examples/whisper.android)
|
||||
- [x] [Java](bindings/java/README.md)
|
||||
- [x] Linux / [FreeBSD](https://github.com/ggml-org/whisper.cpp/issues/56#issuecomment-1350920264)
|
||||
- [x] Linux / [FreeBSD](https://github.com/ggerganov/whisper.cpp/issues/56#issuecomment-1350920264)
|
||||
- [x] [WebAssembly](examples/whisper.wasm)
|
||||
- [x] Windows ([MSVC](https://github.com/ggml-org/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggml-org/whisper.cpp/issues/168))
|
||||
- [x] [Raspberry Pi](https://github.com/ggml-org/whisper.cpp/discussions/166)
|
||||
- [x] [Docker](https://github.com/ggml-org/whisper.cpp/pkgs/container/whisper.cpp)
|
||||
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168))
|
||||
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
|
||||
- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
|
||||
|
||||
The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
|
||||
The rest of the code is part of the [`ggml`](https://github.com/ggml-org/ggml) machine learning library.
|
||||
The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
|
||||
|
||||
Having such a lightweight implementation of the model makes it easy to integrate into different platforms and applications.
|
||||
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
|
||||
|
|
@ -53,14 +54,14 @@ https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a
|
|||
|
||||
On Apple Silicon, the inference runs fully on the GPU via Metal:
|
||||
|
||||
https://github.com/ggml-org/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
|
||||
https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
|
||||
|
||||
## Quick start
|
||||
|
||||
First clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
```
|
||||
|
||||
Navigate into the directory:
|
||||
|
|
@ -80,7 +81,7 @@ Now build the [whisper-cli](examples/cli) example and transcribe an audio file l
|
|||
```bash
|
||||
# build the project
|
||||
cmake -B build
|
||||
cmake --build build -j --config Release
|
||||
cmake --build build --config Release
|
||||
|
||||
# transcribe an audio file
|
||||
./build/bin/whisper-cli -f samples/jfk.wav
|
||||
|
|
@ -149,9 +150,8 @@ standard cmake setup with:
|
|||
```bash
|
||||
# build with GGML_BLAS defined
|
||||
cmake -B build -DGGML_BLAS=1
|
||||
cmake --build build -j --config Release
|
||||
cmake --build build --config Release
|
||||
./build/bin/whisper-cli [ .. etc .. ]
|
||||
```
|
||||
|
||||
## Quantization
|
||||
|
||||
|
|
@ -163,7 +163,7 @@ Here are the steps for creating and using a quantized model:
|
|||
```bash
|
||||
# quantize a model with Q5_0 method
|
||||
cmake -B build
|
||||
cmake --build build -j --config Release
|
||||
cmake --build build --config Release
|
||||
./build/bin/quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
|
||||
|
||||
# run the examples as usual, specifying the quantized model file
|
||||
|
|
@ -225,7 +225,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
|
|||
The first run on a device is slow, since the ANE service compiles the Core ML model to some device-specific format.
|
||||
Next runs are faster.
|
||||
|
||||
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggml-org/whisper.cpp/pull/566).
|
||||
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
|
||||
|
||||
## OpenVINO support
|
||||
|
||||
|
|
@ -267,7 +267,7 @@ This can result in significant speedup in encoder performance. Here are the inst
|
|||
|
||||
- Build `whisper.cpp` with OpenVINO support:
|
||||
|
||||
Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2024.6.0](https://github.com/openvinotoolkit/openvino/releases/tag/2024.6.0). Ready to use Binaries of the required libraries can be found in the [OpenVino Archives](https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/)
|
||||
Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2023.0.0](https://github.com/openvinotoolkit/openvino/releases/tag/2023.0.0).
|
||||
|
||||
After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:
|
||||
|
||||
|
|
@ -310,7 +310,7 @@ This can result in significant speedup in encoder performance. Here are the inst
|
|||
The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
|
||||
cached for the next run.
|
||||
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).
|
||||
|
||||
## NVIDIA GPU support
|
||||
|
||||
|
|
@ -324,12 +324,6 @@ cmake -B build -DGGML_CUDA=1
|
|||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
or for newer NVIDIA GPUs (RTX 5000 series):
|
||||
```
|
||||
cmake -B build -DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
## Vulkan GPU support
|
||||
Cross-vendor solution which allows you to accelerate workload on your GPU.
|
||||
First, make sure your graphics card driver provides support for Vulkan API.
|
||||
|
|
@ -362,7 +356,6 @@ First, check if your Ascend NPU device is supported:
|
|||
| Ascend NPU | Status |
|
||||
|:-----------------------------:|:-------:|
|
||||
| Atlas 300T A2 | Support |
|
||||
| Atlas 300I Duo | Support |
|
||||
|
||||
Then, make sure you have installed the [`CANN toolkit`](https://www.hiascend.com/en/software/cann/community). The latest version of CANN is recommended.
|
||||
|
||||
|
|
@ -384,56 +377,6 @@ Run the inference examples as usual, for example:
|
|||
- If you have trouble with an Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
|
||||
- If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
|
||||
|
||||
## Moore Threads GPU support
|
||||
|
||||
With Moore Threads cards the processing of the models is done efficiently on the GPU via muBLAS and custom MUSA kernels.
|
||||
First, make sure you have installed `MUSA SDK rc4.2.0`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=4.2.0
|
||||
|
||||
Now build `whisper.cpp` with MUSA support:
|
||||
|
||||
```
|
||||
cmake -B build -DGGML_MUSA=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
or specify the architecture for your Moore Threads GPU. For example, if you have a MTT S80 GPU, you can specify the architecture as follows:
|
||||
|
||||
```
|
||||
cmake -B build -DGGML_MUSA=1 -DMUSA_ARCHITECTURES="21"
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
## FFmpeg support (Linux only)
|
||||
|
||||
If you want to support more audio formats (such as Opus and AAC), you can turn on the `WHISPER_FFMPEG` build flag to enable FFmpeg integration.
|
||||
|
||||
First, you need to install required libraries:
|
||||
|
||||
```bash
|
||||
# Debian/Ubuntu
|
||||
sudo apt install libavcodec-dev libavformat-dev libavutil-dev
|
||||
|
||||
# RHEL/Fedora
|
||||
sudo dnf install libavcodec-free-devel libavformat-free-devel libavutil-free-devel
|
||||
```
|
||||
|
||||
Then you can build the project as follows:
|
||||
|
||||
```bash
|
||||
cmake -B build -D WHISPER_FFMPEG=yes
|
||||
cmake --build build
|
||||
```
|
||||
|
||||
Run the following example to confirm it's working:
|
||||
|
||||
```bash
|
||||
# Convert an audio file to Opus format
|
||||
ffmpeg -i samples/jfk.wav jfk.opus
|
||||
|
||||
# Transcribe the audio file
|
||||
./build/bin/whisper-cli --model models/ggml-base.en.bin --file jfk.opus
|
||||
```
|
||||
|
||||
## Docker
|
||||
|
||||
### Prerequisites
|
||||
|
|
@ -443,12 +386,10 @@ ffmpeg -i samples/jfk.wav jfk.opus
|
|||
|
||||
### Images
|
||||
|
||||
We have multiple Docker images available for this project:
|
||||
We have two Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggml-org/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggml-org/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
3. `ghcr.io/ggml-org/whisper.cpp:main-musa`: Same as `main` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
4. `ghcr.io/ggml-org/whisper.cpp:main-vulkan`: Same as `main` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
|
||||
### Usage
|
||||
|
||||
|
|
@ -457,27 +398,15 @@ We have multiple Docker images available for this project:
|
|||
docker run -it --rm \
|
||||
-v path/to/models:/models \
|
||||
whisper.cpp:main "./models/download-ggml-model.sh base /models"
|
||||
|
||||
# transcribe an audio file
|
||||
docker run -it --rm \
|
||||
-v path/to/models:/models \
|
||||
-v path/to/audios:/audios \
|
||||
whisper.cpp:main "whisper-cli -m /models/ggml-base.bin -f /audios/jfk.wav"
|
||||
|
||||
whisper.cpp:main "./main -m /models/ggml-base.bin -f /audios/jfk.wav"
|
||||
# transcribe an audio file in samples folder
|
||||
docker run -it --rm \
|
||||
-v path/to/models:/models \
|
||||
whisper.cpp:main "whisper-cli -m /models/ggml-base.bin -f ./samples/jfk.wav"
|
||||
|
||||
# run the web server
|
||||
docker run -it --rm -p "8080:8080" \
|
||||
-v path/to/models:/models \
|
||||
whisper.cpp:main "whisper-server --host 127.0.0.1 -m /models/ggml-base.bin"
|
||||
|
||||
# run the bench tool on the small.en model using 4 threads
|
||||
docker run -it --rm \
|
||||
-v path/to/models:/models \
|
||||
whisper.cpp:main "whisper-bench -m /models/ggml-small.en.bin -t 4"
|
||||
whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
|
||||
```
|
||||
|
||||
## Installing with Conan
|
||||
|
|
@ -498,12 +427,12 @@ For detailed instructions on how to use Conan, please refer to the [Conan docume
|
|||
|
||||
This is a naive example of performing real-time inference on audio from your microphone.
|
||||
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
|
||||
More info is available in [issue #10](https://github.com/ggml-org/whisper.cpp/issues/10).
|
||||
You will need to have [sdl2](https://wiki.libsdl.org/SDL2/Installation) installed for it to work properly.
|
||||
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
|
||||
You will need to have [sdl2](https://wiki.libsdl.org/SDL2/Installation) installed for it to work properly.
|
||||
|
||||
```bash
|
||||
cmake -B build -DWHISPER_SDL2=ON
|
||||
cmake --build build -j --config Release
|
||||
cmake --build build --config Release
|
||||
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
|
||||
```
|
||||
|
||||
|
|
@ -587,7 +516,7 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr
|
|||
|
||||
## Speaker segmentation via tinydiarize (experimental)
|
||||
|
||||
More information about this approach is available here: https://github.com/ggml-org/whisper.cpp/pull/1058
|
||||
More information about this approach is available here: https://github.com/ggerganov/whisper.cpp/pull/1058
|
||||
|
||||
Sample usage:
|
||||
|
||||
|
|
@ -614,7 +543,7 @@ main: processing './samples/a13.wav' (480000 samples, 30.0 sec), 4 threads, 1 pr
|
|||
## Karaoke-style movie generation (experimental)
|
||||
|
||||
The [whisper-cli](examples/cli) example provides support for output of karaoke-style movies, where the
|
||||
currently pronounced word is highlighted. Use the `-owts` argument and run the generated bash script.
|
||||
currently pronounced word is highlighted. Use the `-wts` argument and run the generated bash script.
|
||||
This requires `ffmpeg` to be installed.
|
||||
|
||||
Here are a few _"typical"_ examples:
|
||||
|
|
@ -651,7 +580,7 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a
|
|||
|
||||
## Video comparison of different models
|
||||
|
||||
Use the [scripts/bench-wts.sh](https://github.com/ggml-org/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:
|
||||
Use the [scripts/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:
|
||||
|
||||
```bash
|
||||
./scripts/bench-wts.sh samples/jfk.wav
|
||||
|
|
@ -668,7 +597,7 @@ In order to have an objective comparison of the performance of the inference acr
|
|||
use the [whisper-bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
|
||||
took to execute it. The results are summarized in the following Github issue:
|
||||
|
||||
[Benchmark results](https://github.com/ggml-org/whisper.cpp/issues/89)
|
||||
[Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
|
||||
|
||||
Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).
|
||||
|
||||
|
|
@ -695,24 +624,25 @@ You can download the converted models using the [models/download-ggml-model.sh](
|
|||
or manually from here:
|
||||
|
||||
- https://huggingface.co/ggerganov/whisper.cpp
|
||||
- https://ggml.ggerganov.com
|
||||
|
||||
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or [models/README.md](models/README.md).
|
||||
|
||||
## [Bindings](https://github.com/ggml-org/whisper.cpp/discussions/categories/bindings)
|
||||
## [Bindings](https://github.com/ggerganov/whisper.cpp/discussions/categories/bindings)
|
||||
|
||||
- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggml-org/whisper.cpp/discussions/310)
|
||||
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggml-org/whisper.cpp/discussions/309)
|
||||
- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
|
||||
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
|
||||
- React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
|
||||
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggml-org/whisper.cpp/discussions/312)
|
||||
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
|
||||
- [x] Java:
|
||||
- [GiviMAD/whisper-jni](https://github.com/GiviMAD/whisper-jni)
|
||||
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggml-org/whisper.cpp/discussions/507)
|
||||
- [x] Objective-C / Swift: [ggml-org/whisper.spm](https://github.com/ggml-org/whisper.spm) | [#313](https://github.com/ggml-org/whisper.cpp/discussions/313)
|
||||
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
|
||||
- [x] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
|
||||
- [exPHAT/SwiftWhisper](https://github.com/exPHAT/SwiftWhisper)
|
||||
- [x] .NET: | [#422](https://github.com/ggml-org/whisper.cpp/discussions/422)
|
||||
- [x] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
|
||||
- [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
|
||||
- [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
|
||||
- [x] Python: | [#9](https://github.com/ggml-org/whisper.cpp/issues/9)
|
||||
- [x] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
|
||||
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
|
||||
- [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
|
||||
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
|
||||
|
|
@ -720,118 +650,6 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
|
|||
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
|
||||
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
|
||||
|
||||
## XCFramework
|
||||
The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
|
||||
and macOS. It can be used in Swift projects without the need to compile the
|
||||
library from source. For example, the v1.7.5 version of the XCFramework can be
|
||||
used as follows:
|
||||
|
||||
```swift
|
||||
// swift-tools-version: 5.10
|
||||
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||
|
||||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
name: "Whisper",
|
||||
targets: [
|
||||
.executableTarget(
|
||||
name: "Whisper",
|
||||
dependencies: [
|
||||
"WhisperFramework"
|
||||
]),
|
||||
.binaryTarget(
|
||||
name: "WhisperFramework",
|
||||
url: "https://github.com/ggml-org/whisper.cpp/releases/download/v1.7.5/whisper-v1.7.5-xcframework.zip",
|
||||
checksum: "c7faeb328620d6012e130f3d705c51a6ea6c995605f2df50f6e1ad68c59c6c4a"
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Voice Activity Detection (VAD)
|
||||
Support for Voice Activity Detection (VAD) can be enabled using the `--vad`
|
||||
argument to `whisper-cli`. In addition to this option a VAD model is also
|
||||
required.
|
||||
|
||||
The way this works is that first the audio samples are passed through
|
||||
the VAD model which will detect speech segments. Using this information,
|
||||
only the speech segments that are detected are extracted from the original audio
|
||||
input and passed to whisper for processing. This reduces the amount of audio
|
||||
data that needs to be processed by whisper and can significantly speed up the
|
||||
transcription process.
|
||||
|
||||
The following VAD models are currently supported:
|
||||
|
||||
### Silero-VAD
|
||||
[Silero-vad](https://github.com/snakers4/silero-vad) is a lightweight VAD model
|
||||
written in Python that is fast and accurate.
|
||||
|
||||
Models can be downloaded by running the following command on Linux or MacOS:
|
||||
```console
|
||||
$ ./models/download-vad-model.sh silero-v6.2.0
|
||||
Downloading ggml model silero-v6.2.0 from 'https://huggingface.co/ggml-org/whisper-vad' ...
|
||||
ggml-silero-v6.2.0.bin 100%[==============================================>] 864.35K --.-KB/s in 0.04s
|
||||
Done! Model 'silero-v6.2.0' saved in '/path/models/ggml-silero-v6.2.0.bin'
|
||||
You can now use it like this:
|
||||
|
||||
$ ./build/bin/whisper-cli -vm /path/models/ggml-silero-v6.2.0.bin --vad -f samples/jfk.wav -m models/ggml-base.en.bin
|
||||
|
||||
```
|
||||
And the following command on Windows:
|
||||
```console
|
||||
> .\models\download-vad-model.cmd silero-v6.2.0
|
||||
Downloading vad model silero-v6.2.0...
|
||||
Done! Model silero-v6.2.0 saved in C:\Users\danie\work\ai\whisper.cpp\ggml-silero-v6.2.0.bin
|
||||
You can now use it like this:
|
||||
|
||||
C:\path\build\bin\Release\whisper-cli.exe -vm C:\path\ggml-silero-v6.2.0.bin --vad -m models/ggml-base.en.bin -f samples\jfk.wav
|
||||
|
||||
```
|
||||
|
||||
To see a list of all available models, run the above commands without any
|
||||
arguments.
|
||||
|
||||
This model can also be converted manually to ggml using the following command:
|
||||
```console
|
||||
$ python3 -m venv venv && source venv/bin/activate
|
||||
$ (venv) pip install silero-vad
|
||||
$ (venv) $ python models/convert-silero-vad-to-ggml.py --output models/silero.bin
|
||||
Saving GGML Silero-VAD model to models/silero-v6.2.0-ggml.bin
|
||||
```
|
||||
And it can then be used with whisper as follows:
|
||||
```console
|
||||
$ ./build/bin/whisper-cli \
|
||||
--file ./samples/jfk.wav \
|
||||
--model ./models/ggml-base.en.bin \
|
||||
--vad \
|
||||
--vad-model ./models/silero-v6.2.0-ggml.bin
|
||||
```
|
||||
|
||||
### VAD Options
|
||||
|
||||
* --vad-threshold: Threshold probability for speech detection. A probability
|
||||
for a speech segment/frame above this threshold will be considered as speech.
|
||||
|
||||
* --vad-min-speech-duration-ms: Minimum speech duration in milliseconds. Speech
|
||||
segments shorter than this value will be discarded to filter out brief noise or
|
||||
false positives.
|
||||
|
||||
* --vad-min-silence-duration-ms: Minimum silence duration in milliseconds. Silence
|
||||
periods must be at least this long to end a speech segment. Shorter silence
|
||||
periods will be ignored and included as part of the speech.
|
||||
|
||||
* --vad-max-speech-duration-s: Maximum speech duration in seconds. Speech segments
|
||||
longer than this will be automatically split into multiple segments at silence
|
||||
points exceeding 98ms to prevent excessively long segments.
|
||||
|
||||
* --vad-speech-pad-ms: Speech padding in milliseconds. Adds this amount of padding
|
||||
before and after each detected speech segment to avoid cutting off speech edges.
|
||||
|
||||
* --vad-samples-overlap: Amount of audio to extend from each speech segment into
|
||||
the next one, in seconds (e.g., 0.10 = 100ms overlap). This ensures speech isn't
|
||||
cut off abruptly between segments when they're concatenated together.
|
||||
|
||||
## Examples
|
||||
|
||||
There are various examples of using the library for different projects in the [examples](examples) folder.
|
||||
|
|
@ -850,13 +668,13 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
|
|||
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
|
||||
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
|
||||
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
|
||||
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggml-org/whisper.cpp/issues/185) |
|
||||
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
|
||||
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
|
||||
| [wchess](examples/wchess) | [wchess.wasm](examples/wchess) | Voice-controlled chess |
|
||||
|
||||
## [Discussions](https://github.com/ggml-org/whisper.cpp/discussions)
|
||||
## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)
|
||||
|
||||
If you have any kind of feedback about this project feel free to use the Discussions section and open a new topic.
|
||||
You can use the [Show and tell](https://github.com/ggml-org/whisper.cpp/discussions/categories/show-and-tell) category
|
||||
You can use the [Show and tell](https://github.com/ggerganov/whisper.cpp/discussions/categories/show-and-tell) category
|
||||
to share your own projects that use `whisper.cpp`. If you have a question, make sure to check the
|
||||
[Frequently asked questions (#126)](https://github.com/ggml-org/whisper.cpp/discussions/126) discussion.
|
||||
[Frequently asked questions (#126)](https://github.com/ggerganov/whisper.cpp/discussions/126) discussion.
|
||||
|
|
|
|||
498
README_sycl.md
498
README_sycl.md
|
|
@ -1,249 +1,249 @@
|
|||
# whisper.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses SYCL as a direct programming language to support CPUs, GPUs, and FPGAs.
|
||||
|
||||
To avoid re-inventing the wheel, this code refers to other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use an open-source tool, [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)), to migrate to SYCL.
|
||||
|
||||
The whisper.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPUs, we recommend using whisper.cpp for X86 (Intel MKL build).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
The following guide uses the default folder as an example. If you use a different folder, please adjust the paths in this guide accordingly.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON
|
||||
|
||||
#for FP32
|
||||
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, all binary files are built, which takes more time. To reduce the build time, we recommend building **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute whisper.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_whisper.sh
|
||||
```
|
||||
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
|
||||
|WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. For FP32, do not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
The oneAPI running environment has not been enabled.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems, memcpy behaves abnormally and blocks.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
||||
# whisper.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses SYCL as a direct programming language to support CPUs, GPUs, and FPGAs.
|
||||
|
||||
To avoid re-inventing the wheel, this code refers to other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use an open-source tool, [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)), to migrate to SYCL.
|
||||
|
||||
The whisper.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPUs, we recommend using whisper.cpp for X86 (Intel MKL build).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
The following guide uses the default folder as an example. If you use a different folder, please adjust the paths in this guide accordingly.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON
|
||||
|
||||
#for FP32
|
||||
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, all binary files are built, which takes more time. To reduce the build time, we recommend building **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute whisper.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_whisper.sh
|
||||
```
|
||||
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
|
||||
|WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. For FP32, do not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
The oneAPI running environment has not been enabled.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems, memcpy behaves abnormally and blocks.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
||||
|
|
@ -15,7 +15,7 @@ BUILD_DIR := build_go
|
|||
MODELS_DIR := models
|
||||
EXAMPLES_DIR := $(wildcard examples/*)
|
||||
INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
|
||||
LIBRARY_PATH := $(abspath ../../${BUILD_DIR}/src):$(abspath ../../${BUILD_DIR}/ggml/src)
|
||||
LIBRARY_PATH := $(abspath ../../${BUILD_DIR}/src:$(abspath ../../${BUILD_DIR}/ggml/src))
|
||||
|
||||
ifeq ($(GGML_CUDA),1)
|
||||
LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
|
||||
|
|
@ -23,8 +23,7 @@ ifeq ($(GGML_CUDA),1)
|
|||
endif
|
||||
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
LIBRARY_PATH := $(LIBRARY_PATH):$(abspath ../../${BUILD_DIR}/ggml/src/ggml-blas):$(abspath ../../${BUILD_DIR}/ggml/src/ggml-metal)
|
||||
EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit -lggml-metal -lggml-blas
|
||||
EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
|
||||
endif
|
||||
|
||||
all: clean whisper examples
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ func main() {
|
|||
In order to build, you need to have the Go compiler installed. You can get it from [here](https://golang.org/dl/). Run the tests with:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
cd whisper.cpp/bindings/go
|
||||
make test
|
||||
```
|
||||
|
|
@ -98,7 +98,7 @@ The API Documentation:
|
|||
|
||||
Getting help:
|
||||
|
||||
* Follow the discussion for the go bindings [here](https://github.com/ggml-org/whisper.cpp/discussions/312)
|
||||
* Follow the discussion for the go bindings [here](https://github.com/ggerganov/whisper.cpp/discussions/312)
|
||||
|
||||
## License
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
github.com/ggml-org/whisper.cpp/bindings/go
|
||||
github.com/ggerganov/whisper.cpp/bindings/go
|
||||
provides a speech-to-text service bindings for the Go programming language.
|
||||
*/
|
||||
package whisper
|
||||
|
|
|
|||
|
|
@ -282,20 +282,13 @@ func Download(ctx context.Context, p io.Writer, model, out string) (string, erro
|
|||
default:
|
||||
// Read body
|
||||
n, err := resp.Body.Read(data)
|
||||
if n > 0 {
|
||||
if m, err := w.Write(data[:n]); err != nil {
|
||||
return path, err
|
||||
} else {
|
||||
count += int64(m)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
DownloadReport(p, pct, count, resp.ContentLength)
|
||||
return path, nil
|
||||
}
|
||||
DownloadReport(p, pct, count, resp.ContentLength)
|
||||
return path, err
|
||||
} else if m, err := w.Write(data[:n]); err != nil {
|
||||
return path, err
|
||||
} else {
|
||||
count += int64(m)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,39 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
|
|||
p.print_timestamps = toBool(v)
|
||||
}
|
||||
|
||||
// Voice Activity Detection (VAD)
|
||||
func (p *Params) SetVAD(v bool) {
|
||||
p.vad = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetVADModelPath(path string) {
|
||||
p.vad_model_path = C.CString(path)
|
||||
}
|
||||
|
||||
func (p *Params) SetVADThreshold(t float32) {
|
||||
p.vad_params.threshold = C.float(t)
|
||||
}
|
||||
|
||||
func (p *Params) SetVADMinSpeechMs(ms int) {
|
||||
p.vad_params.min_speech_duration_ms = C.int(ms)
|
||||
}
|
||||
|
||||
func (p *Params) SetVADMinSilenceMs(ms int) {
|
||||
p.vad_params.min_silence_duration_ms = C.int(ms)
|
||||
}
|
||||
|
||||
func (p *Params) SetVADMaxSpeechSec(s float32) {
|
||||
p.vad_params.max_speech_duration_s = C.float(s)
|
||||
}
|
||||
|
||||
func (p *Params) SetVADSpeechPadMs(ms int) {
|
||||
p.vad_params.speech_pad_ms = C.int(ms)
|
||||
}
|
||||
|
||||
func (p *Params) SetVADSamplesOverlap(sec float32) {
|
||||
p.vad_params.samples_overlap = C.float(sec)
|
||||
}
|
||||
|
||||
// Set language id
|
||||
func (p *Params) SetLanguage(lang int) error {
|
||||
if lang == -1 {
|
||||
|
|
@ -179,10 +146,6 @@ func (p *Params) SetInitialPrompt(prompt string) {
|
|||
p.initial_prompt = C.CString(prompt)
|
||||
}
|
||||
|
||||
func (p *Params) SetCarryInitialPrompt(v bool) {
|
||||
p.carry_initial_prompt = toBool(v)
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PRIVATE METHODS
|
||||
|
||||
|
|
@ -236,9 +199,6 @@ func (p *Params) String() string {
|
|||
if p.token_timestamps {
|
||||
str += " token_timestamps"
|
||||
}
|
||||
if p.carry_initial_prompt {
|
||||
str += " carry_initial_prompt"
|
||||
}
|
||||
|
||||
return str + ">"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -80,39 +80,6 @@ func (context *context) SetTranslate(v bool) {
|
|||
context.params.SetTranslate(v)
|
||||
}
|
||||
|
||||
// Voice Activity Detection (VAD)
|
||||
func (context *context) SetVAD(v bool) {
|
||||
context.params.SetVAD(v)
|
||||
}
|
||||
|
||||
func (context *context) SetVADModelPath(path string) {
|
||||
context.params.SetVADModelPath(path)
|
||||
}
|
||||
|
||||
func (context *context) SetVADThreshold(t float32) {
|
||||
context.params.SetVADThreshold(t)
|
||||
}
|
||||
|
||||
func (context *context) SetVADMinSpeechMs(ms int) {
|
||||
context.params.SetVADMinSpeechMs(ms)
|
||||
}
|
||||
|
||||
func (context *context) SetVADMinSilenceMs(ms int) {
|
||||
context.params.SetVADMinSilenceMs(ms)
|
||||
}
|
||||
|
||||
func (context *context) SetVADMaxSpeechSec(s float32) {
|
||||
context.params.SetVADMaxSpeechSec(s)
|
||||
}
|
||||
|
||||
func (context *context) SetVADSpeechPadMs(ms int) {
|
||||
context.params.SetVADSpeechPadMs(ms)
|
||||
}
|
||||
|
||||
func (context *context) SetVADSamplesOverlap(sec float32) {
|
||||
context.params.SetVADSamplesOverlap(sec)
|
||||
}
|
||||
|
||||
func (context *context) SetSplitOnWord(v bool) {
|
||||
context.params.SetSplitOnWord(v)
|
||||
}
|
||||
|
|
@ -270,9 +237,6 @@ func (context *context) Process(
|
|||
return err
|
||||
}
|
||||
|
||||
// Reset n so that more Segments can be available within NextSegment call
|
||||
context.n = 0
|
||||
|
||||
// Return success
|
||||
return nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -60,15 +60,6 @@ type Context interface {
|
|||
SetTemperature(t float32) // Set temperature
|
||||
SetTemperatureFallback(t float32) // Set temperature incrementation
|
||||
|
||||
SetVAD(v bool)
|
||||
SetVADModelPath(path string)
|
||||
SetVADThreshold(t float32)
|
||||
SetVADMinSpeechMs(ms int)
|
||||
SetVADMinSilenceMs(ms int)
|
||||
SetVADMaxSpeechSec(s float32)
|
||||
SetVADSpeechPadMs(ms int)
|
||||
SetVADSamplesOverlap(sec float32)
|
||||
|
||||
// Process mono audio data and return any errors.
|
||||
// If defined, newly generated segments are passed to the
|
||||
// callback function during processing.
|
||||
|
|
|
|||
|
|
@ -9,9 +9,7 @@ import (
|
|||
// CGO
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -lwhisper -lggml -lggml-base -lggml-cpu -lm -lstdc++
|
||||
#cgo linux LDFLAGS: -fopenmp
|
||||
#cgo darwin LDFLAGS: -lggml-metal -lggml-blas
|
||||
#cgo LDFLAGS: -lwhisper -lggml -lggml-base -lggml-cpu -lm -lstdc++ -fopenmp
|
||||
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
|
||||
#include <whisper.h>
|
||||
#include <stdlib.h>
|
||||
|
|
|
|||
|
|
@ -23,42 +23,26 @@ import io.github.ggerganov.whispercpp.WhisperCpp;
|
|||
public class Example {
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
WhisperCpp whisper = new WhisperCpp();
|
||||
// By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
|
||||
// or you can provide the absolute path to the model file.
|
||||
long context = whisper.initContext("base.en");
|
||||
try {
|
||||
// By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
|
||||
// or you can provide the absolute path to the model file.
|
||||
whisper.initContext("../ggml-base.en.bin");
|
||||
WhisperFullParams.ByValue whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
var whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
// custom configuration if required
|
||||
whisperParams.temperature_inc = 0f;
|
||||
|
||||
// custom configuration if required
|
||||
//whisperParams.n_threads = 8;
|
||||
whisperParams.temperature = 0.0f;
|
||||
whisperParams.temperature_inc = 0.2f;
|
||||
//whisperParams.language = "en";
|
||||
|
||||
float[] samples = readAudio(); // divide each value by 32767.0f
|
||||
List<WhisperSegment> whisperSegmentList = whisper.fullTranscribeWithTime(whisperParams, samples);
|
||||
var samples = readAudio(); // divide each value by 32767.0f
|
||||
whisper.fullTranscribe(whisperParams, samples);
|
||||
|
||||
for (WhisperSegment whisperSegment : whisperSegmentList) {
|
||||
|
||||
long start = whisperSegment.getStart();
|
||||
long end = whisperSegment.getEnd();
|
||||
|
||||
String text = whisperSegment.getSentence();
|
||||
|
||||
System.out.println("start: "+start);
|
||||
System.out.println("end: "+end);
|
||||
System.out.println("text: "+text);
|
||||
|
||||
int segmentCount = whisper.getTextSegmentCount(context);
|
||||
for (int i = 0; i < segmentCount; i++) {
|
||||
String text = whisper.getTextSegment(context, i);
|
||||
System.out.println(segment.getText());
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
whisper.close();
|
||||
whisper.freeContext(context);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
```
|
||||
|
|
@ -68,7 +52,7 @@ public class Example {
|
|||
In order to build, you need to have the JDK 8 or higher installed. Run the tests with:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
cd whisper.cpp/bindings/java
|
||||
|
||||
./gradlew build
|
||||
|
|
|
|||
|
|
@ -27,41 +27,23 @@ sourceSets {
|
|||
tasks.register('copyLibwhisperDynlib', Copy) {
|
||||
from '../../build/src'
|
||||
include 'libwhisper.dylib'
|
||||
into 'build/generated/resources/main'
|
||||
into 'build/generated/resources/main/darwin'
|
||||
}
|
||||
|
||||
tasks.register('copyLibwhisperSo', Copy) {
|
||||
from '../../build/src'
|
||||
include 'libwhisper.so'
|
||||
into 'build/generated/resources/main'
|
||||
into 'build/generated/resources/main/linux-x86-64'
|
||||
}
|
||||
|
||||
tasks.register('copyWhisperDLL', Copy) {
|
||||
from '../../build/bin/Release'
|
||||
tasks.register('copyWhisperDll', Copy) {
|
||||
from '../../build/Release'
|
||||
include 'whisper.dll'
|
||||
into 'build/generated/resources/main'
|
||||
}
|
||||
|
||||
tasks.register('copyGGML_BASE_DLL', Copy) {
|
||||
from '../../build/bin/Release'
|
||||
include 'ggml-base.dll'
|
||||
into 'build/generated/resources/main'
|
||||
}
|
||||
|
||||
tasks.register('copyGGML_DLL', Copy) {
|
||||
from '../../build/bin/Release'
|
||||
include 'ggml.dll'
|
||||
into 'build/generated/resources/main'
|
||||
}
|
||||
|
||||
tasks.register('copyGGML_CPU_DLL', Copy) {
|
||||
from '../../build/bin/Release'
|
||||
include 'ggml-cpu.dll'
|
||||
into 'build/generated/resources/main'
|
||||
into 'build/generated/resources/main/windows-x86-64'
|
||||
}
|
||||
|
||||
tasks.register('copyLibs') {
|
||||
dependsOn copyLibwhisperDynlib, copyLibwhisperSo, copyWhisperDLL, copyGGML_BASE_DLL, copyGGML_DLL, copyGGML_CPU_DLL
|
||||
dependsOn copyLibwhisperDynlib, copyLibwhisperSo, copyWhisperDll
|
||||
}
|
||||
|
||||
test {
|
||||
|
|
|
|||
|
|
@ -168,26 +168,23 @@ public class WhisperCpp implements AutoCloseable {
|
|||
return str.toString().trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Full transcribe with time list.
|
||||
*
|
||||
* @param whisperParams the whisper params
|
||||
* @param audioData the audio data
|
||||
* @return the list
|
||||
* @throws IOException the io exception
|
||||
*/
|
||||
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
|
||||
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
||||
if (ctx == null) {
|
||||
throw new IllegalStateException("Model not initialised");
|
||||
}
|
||||
|
||||
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
|
||||
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
|
||||
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
|
||||
valueParams.read();
|
||||
|
||||
if (lib.whisper_full(ctx, valueParams, audioData, audioData.length) != 0) {
|
||||
throw new IOException("Failed to process audio");
|
||||
}
|
||||
|
||||
int nSegments = lib.whisper_full_n_segments(ctx);
|
||||
List<WhisperSegment> segments= new ArrayList<>(nSegments);
|
||||
|
||||
|
||||
for (int i = 0; i < nSegments; i++) {
|
||||
long t0 = lib.whisper_full_get_segment_t0(ctx, i);
|
||||
String text = lib.whisper_full_get_segment_text(ctx, i);
|
||||
|
|
|
|||
|
|
@ -9,7 +9,6 @@ import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
|||
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
|
||||
|
||||
public interface WhisperCppJnaLibrary extends Library {
|
||||
|
||||
WhisperCppJnaLibrary instance = Native.load("whisper", WhisperCppJnaLibrary.class);
|
||||
|
||||
String whisper_print_system_info();
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ public class WhisperContextParams extends Structure {
|
|||
/** Use GPU for inference (default = true) */
|
||||
public CBool use_gpu;
|
||||
|
||||
/** Use flash attention (default = true) */
|
||||
/** Use flash attention (default = false) */
|
||||
public CBool flash_attn;
|
||||
|
||||
/** CUDA device to use (default = 0) */
|
||||
|
|
|
|||
|
|
@ -157,8 +157,6 @@ public class WhisperFullParams extends Structure {
|
|||
/** Tokens to provide to the whisper decoder as an initial prompt.
|
||||
* These are prepended to any existing text context from a previous call. */
|
||||
public String initial_prompt;
|
||||
/** Always prepend initial_prompt for every decode chunk. */
|
||||
public CBool carry_initial_prompt;
|
||||
|
||||
/** Prompt tokens. (int*) */
|
||||
public Pointer prompt_tokens;
|
||||
|
|
@ -338,8 +336,8 @@ public class WhisperFullParams extends Structure {
|
|||
"no_timestamps", "single_segment", "print_special",
|
||||
"print_progress", "print_realtime", "print_timestamps",
|
||||
"token_timestamps", "thold_pt", "thold_ptsum", "max_len",
|
||||
"split_on_word", "max_tokens", "debug_mode", "audio_ctx",
|
||||
"tdrz_enable", "suppress_regex", "initial_prompt", "carry_initial_prompt",
|
||||
"split_on_word", "max_tokens", "debug_mode", "audio_ctx",
|
||||
"tdrz_enable", "suppress_regex", "initial_prompt",
|
||||
"prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
||||
"suppress_blank", "suppress_nst", "temperature",
|
||||
"max_initial_ts", "length_penalty", "temperature_inc",
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ import static org.junit.jupiter.api.Assertions.*;
|
|||
|
||||
import io.github.ggerganov.whispercpp.bean.WhisperSegment;
|
||||
import io.github.ggerganov.whispercpp.params.CBool;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
|
|
@ -26,9 +25,7 @@ class WhisperCppTest {
|
|||
//String modelName = "../../models/ggml-tiny.bin";
|
||||
String modelName = "../../models/ggml-tiny.en.bin";
|
||||
try {
|
||||
WhisperContextParams.ByValue contextParams = whisper.getContextDefaultParams();
|
||||
contextParams.useFlashAttn(false); // Disable flash attention
|
||||
whisper.initContext(modelName, contextParams);
|
||||
whisper.initContext(modelName);
|
||||
//whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
//whisper.getJavaDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
modelInitialised = true;
|
||||
|
|
@ -121,7 +118,7 @@ class WhisperCppTest {
|
|||
float[] floats = new float[b.length / 2];
|
||||
|
||||
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
|
||||
params.print_progress = CBool.FALSE;
|
||||
//params.initial_prompt = "and so my fellow Americans um, like";
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "whisper.cpp",
|
||||
"version": "1.8.4",
|
||||
"version": "1.7.4",
|
||||
"description": "Whisper speech recognition",
|
||||
"main": "whisper.js",
|
||||
"scripts": {
|
||||
|
|
|
|||
|
|
@ -1,3 +0,0 @@
|
|||
README.md
|
||||
LICENSE
|
||||
sig
|
||||
|
|
@ -1,9 +1,3 @@
|
|||
LICENSE
|
||||
pkg/
|
||||
lib/whisper.*
|
||||
ext/examples/
|
||||
ext/ggml/
|
||||
ext/include/
|
||||
ext/scripts/
|
||||
ext/src/
|
||||
test/fixtures/
|
||||
|
|
|
|||
|
|
@ -1,2 +0,0 @@
|
|||
title: whispercpp
|
||||
main_page: README.md
|
||||
|
|
@ -5,6 +5,17 @@ whispercpp
|
|||
|
||||
Ruby bindings for [whisper.cpp][], an interface of automatic speech recognition model.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Install the gem and add to the application's Gemfile by executing:
|
||||
|
||||
$ bundle add whispercpp
|
||||
|
||||
If bundler is not being used to manage dependencies, install the gem by executing:
|
||||
|
||||
$ gem install whispercpp
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
|
|
@ -20,8 +31,7 @@ params = Whisper::Params.new(
|
|||
max_text_tokens: 300,
|
||||
translate: true,
|
||||
print_timestamps: false,
|
||||
initial_prompt: "Initial prompt here.",
|
||||
carry_initial_prompt: true
|
||||
initial_prompt: "Initial prompt here."
|
||||
)
|
||||
|
||||
whisper.transcribe("path/to/audio.wav", params) do |whole_text|
|
||||
|
|
@ -34,6 +44,17 @@ end
|
|||
|
||||
Some models are prepared up-front:
|
||||
|
||||
```ruby
|
||||
base_en = Whisper::Model.pre_converted_models["base.en"]
|
||||
whisper = Whisper::Context.new(base_en)
|
||||
```
|
||||
|
||||
The first time you use a model, it is downloaded automatically. After that, the cached file is used. To clear the cache, call `#clear_cache`:
|
||||
|
||||
```ruby
|
||||
Whisper::Model.pre_converted_models["base"].clear_cache
|
||||
```
|
||||
|
||||
You also can use shorthand for pre-converted models:
|
||||
|
||||
```ruby
|
||||
|
|
@ -58,19 +79,6 @@ puts Whisper::Model.pre_converted_models.keys
|
|||
# :
|
||||
```
|
||||
|
||||
You can also retrieve each model:
|
||||
|
||||
```ruby
|
||||
base_en = Whisper::Model.pre_converted_models["base.en"]
|
||||
whisper = Whisper::Context.new(base_en)
|
||||
```
|
||||
|
||||
The first time you use a model, it is downloaded automatically. After that, the cached file is used. To clear the cache, call `#clear_cache`:
|
||||
|
||||
```ruby
|
||||
Whisper::Model.pre_converted_models["base"].clear_cache
|
||||
```
|
||||
|
||||
You can also use local model files you prepared:
|
||||
|
||||
```ruby
|
||||
|
|
@ -82,8 +90,7 @@ Or, you can download model files:
|
|||
```ruby
|
||||
whisper = Whisper::Context.new("https://example.net/uri/of/your/model.bin")
|
||||
# Or
|
||||
uri = URI("https://example.net/uri/of/your/model.bin")
|
||||
whisper = Whisper::Context.new(uri)
|
||||
whisper = Whisper::Context.new(URI("https://example.net/uri/of/your/model.bin"))
|
||||
```
|
||||
|
||||
See [models][] page for details.
|
||||
|
|
@ -92,118 +99,9 @@ See [models][] page for details.
|
|||
|
||||
Currently, whisper.cpp accepts only 16-bit WAV files.
|
||||
|
||||
### Voice Activity Detection (VAD) ###
|
||||
|
||||
Support for Voice Activity Detection (VAD) can be enabled by setting `Whisper::Params`'s `vad` argument to `true` and specifying VAD model:
|
||||
|
||||
```ruby
|
||||
Whisper::Params.new(
|
||||
vad: true,
|
||||
vad_model_path: "silero-v6.2.0",
|
||||
# other arguments...
|
||||
)
|
||||
```
|
||||
|
||||
When you pass the model name (`"silero-v6.2.0"`) or URI (`https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin`), it will be downloaded automatically.
|
||||
Currently, "silero-v6.2.0" is registered as a pre-converted model, like the ASR models. You can also specify the file path or URI of a model.
|
||||
|
||||
If you need to configure VAD behavior, pass params for that:
|
||||
|
||||
```ruby
|
||||
Whisper::Params.new(
|
||||
vad: true,
|
||||
vad_model_path: "silero-v6.2.0",
|
||||
vad_params: Whisper::VAD::Params.new(
|
||||
threshold: 1.0, # defaults to 0.5
|
||||
min_speech_duration_ms: 500, # defaults to 250
|
||||
min_silence_duration_ms: 200, # defaults to 100
|
||||
max_speech_duration_s: 30000, # default is FLT_MAX,
|
||||
speech_pad_ms: 50, # defaults to 30
|
||||
samples_overlap: 0.5 # defaults to 0.1
|
||||
),
|
||||
# other arguments...
|
||||
)
|
||||
```
|
||||
|
||||
For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
|
||||
|
||||
### Output ###
|
||||
|
||||
whispercpp supports SRT and WebVTT output:
|
||||
|
||||
```ruby
|
||||
puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
|
||||
# =>
|
||||
WEBVTT
|
||||
|
||||
1
|
||||
00:00:00.000 --> 00:00:03.860
|
||||
My thought I have nobody by a beauty and will as you poured.
|
||||
|
||||
2
|
||||
00:00:03.860 --> 00:00:09.840
|
||||
Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
|
||||
|
||||
3
|
||||
00:00:09.840 --> 00:00:09.940
|
||||
a
|
||||
|
||||
```
|
||||
|
||||
You may call `#to_srt`, too
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Install the gem and add to the application's Gemfile by executing:
|
||||
|
||||
$ bundle add whispercpp
|
||||
|
||||
If bundler is not being used to manage dependencies, install the gem by executing:
|
||||
|
||||
$ gem install whispercpp
|
||||
|
||||
You can pass build options for whisper.cpp, for instance:
|
||||
|
||||
$ bundle config build.whispercpp --enable-ggml-cuda
|
||||
|
||||
or,
|
||||
|
||||
$ gem install whispercpp -- --enable-ggml-cuda
|
||||
|
||||
See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need to convert the options present in the README to Ruby-style options, for example:
|
||||
|
||||
Boolean options:
|
||||
|
||||
* `-DGGML_BLAS=1` -> `--enable-ggml-blas`
|
||||
* `-DWHISPER_COREML=OFF` -> `--disable-whisper-coreml`
|
||||
|
||||
Argument options:
|
||||
|
||||
* `-DGGML_CUDA_COMPRESSION_MODE=size` -> `--ggml-cuda-compression-mode=size`
|
||||
|
||||
Combination:
|
||||
|
||||
* `-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"` -> `--enable-ggml-cuda --cmake-cuda-architectures="86"`
|
||||
|
||||
For boolean options like `GGML_CUDA`, the README says `-DGGML_CUDA=1`. You need to strip `-D`, prepend `--enable-` for `1` or `ON` (`--disable-` for `0` or `OFF`), and make it kebab-case: `--enable-ggml-cuda`.
|
||||
For options which require arguments like `CMAKE_CUDA_ARCHITECTURES`, the README says `-DCMAKE_CUDA_ARCHITECTURES="86"`. You need to strip `-D`, prepend `--`, make it kebab-case, append `=`, and append the argument: `--cmake-cuda-architectures="86"`.
|
||||
|
||||
API
|
||||
---
|
||||
|
||||
### Transcription ###
|
||||
|
||||
By default, `Whisper::Context#transcribe` works in a single thread. You can make it work in parallel by passing `n_processors` option:
|
||||
|
||||
```ruby
|
||||
whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
|
||||
```
|
||||
|
||||
Note that transcription accuracy may occasionally be lower when it runs in parallel.
|
||||
|
||||
If n_processors is greater than 1, you cannot set any callbacks including new_segment_callback, progress_callback, encoder_begin_callback, abort_callback, and log_callback set by Whisper.log_set.
|
||||
|
||||
### Segments ###
|
||||
|
||||
Once `Whisper::Context#transcribe` has been called, you can retrieve segments by `#each_segment`:
|
||||
|
|
@ -225,7 +123,7 @@ whisper
|
|||
ed: format_time(segment.end_time),
|
||||
text: segment.text
|
||||
}
|
||||
line << " (speaker turned)" if segment.speaker_turn_next?
|
||||
line << " (speaker turned)" if segment.speaker_next_turn?
|
||||
puts line
|
||||
end
|
||||
|
||||
|
|
@ -241,7 +139,7 @@ params.on_new_segment do |segment|
|
|||
ed: format_time(segment.end_time),
|
||||
text: segment.text
|
||||
}
|
||||
line << " (speaker turned)" if segment.speaker_turn_next?
|
||||
line << " (speaker turned)" if segment.speaker_next_turn?
|
||||
puts line
|
||||
end
|
||||
|
||||
|
|
@ -249,58 +147,6 @@ whisper.transcribe("path/to/audio.wav", params)
|
|||
|
||||
```
|
||||
|
||||
### Tokens ###
|
||||
|
||||
Each segment has tokens.
|
||||
|
||||
To enable token timestamps, you need to set `Whisper::Params#token_timestamps = true`. Then, retrieve tokens from segments using `Whisper::Segment#each_token`.
|
||||
|
||||
```ruby
|
||||
whisper = Whisper::Context.new("base.en")
|
||||
params = Whisper::Params.new(token_timestamps: true)
|
||||
whisper
|
||||
.transcribe("path/to/audio.wav", params)
|
||||
.each_segment do |segment|
|
||||
segment.each_token do |token|
|
||||
token => {start_time:, end_time:, text:, probability:}
|
||||
st = "%05.2fs" % (start_time / 1000.0)
|
||||
et = "%05.2fs" % (end_time / 1000.0)
|
||||
prob = "%.1f%%" % (probability * 100)
|
||||
puts "[#{st} --> #{et}] #{text} (#{prob})"
|
||||
end
|
||||
end
|
||||
```
|
||||
|
||||
```
|
||||
[00.00s --> 00.00s] [_BEG_] (84.2%)
|
||||
[00.32s --> 00.37s] And (71.2%)
|
||||
[00.37s --> 00.53s] so (98.5%)
|
||||
[00.69s --> 00.85s] my (70.7%)
|
||||
[00.85s --> 01.59s] fellow (99.5%)
|
||||
[01.59s --> 02.10s] Americans (90.1%)
|
||||
[02.85s --> 03.30s] , (28.4%)
|
||||
[03.30s --> 04.14s] ask (79.8%)
|
||||
[04.14s --> 04.28s] not (78.9%)
|
||||
[05.03s --> 05.35s] what (93.3%)
|
||||
[05.41s --> 05.74s] your (98.8%)
|
||||
[05.74s --> 06.41s] country (99.6%)
|
||||
[06.41s --> 06.74s] can (97.7%)
|
||||
[06.74s --> 06.92s] do (99.0%)
|
||||
[07.00s --> 07.00s] for (95.8%)
|
||||
[07.01s --> 07.52s] you (98.5%)
|
||||
[07.81s --> 08.05s] , (49.3%)
|
||||
[08.19s --> 08.37s] ask (65.6%)
|
||||
[08.37s --> 08.75s] what (98.8%)
|
||||
[08.91s --> 09.04s] you (98.2%)
|
||||
[09.04s --> 09.32s] can (96.9%)
|
||||
[09.32s --> 09.38s] do (90.3%)
|
||||
[09.44s --> 09.76s] for (91.8%)
|
||||
[09.76s --> 09.99s] your (98.2%)
|
||||
[10.02s --> 10.36s] country (99.6%)
|
||||
[10.51s --> 10.99s] . (87.0%)
|
||||
[11.00s --> 11.00s] [_TT_550] (7.6%)
|
||||
```
|
||||
|
||||
### Models ###
|
||||
|
||||
You can see model information:
|
||||
|
|
@ -360,7 +206,7 @@ Whisper::Context.new("base")
|
|||
|
||||
### Low-level API to transcribe ###
|
||||
|
||||
You can also call `Whisper::Context#full` and `#full_parallel` with a Ruby array as samples. Although `#transcribe` with audio file path is recommended because it extracts PCM samples in C++ and is fast, `#full` and `#full_parallel` give you flexibility. Unlike `#transcribe`, these methods require 16,000 Hz, 32-bit float audio.
|
||||
You can also call `Whisper::Context#full` and `#full_parallel` with a Ruby array as samples. Although `#transcribe` with audio file path is recommended because it extracts PCM samples in C++ and is fast, `#full` and `#full_parallel` give you flexibility.
|
||||
|
||||
```ruby
|
||||
require "whisper"
|
||||
|
|
@ -377,73 +223,12 @@ whisper
|
|||
end
|
||||
```
|
||||
|
||||
The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView.
|
||||
|
||||
If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy.
|
||||
|
||||
```ruby
|
||||
require "torchaudio"
|
||||
require "ndav/torch/tensor"
|
||||
require "whisper"
|
||||
|
||||
waveform, sample_rate = TorchAudio.load("test/fixtures/jfk.wav")
|
||||
# Convert Torch::Tensor to NDAV
|
||||
samples = waveform.squeeze.to_ndav
|
||||
|
||||
whisper = Whisper::Context.new("base")
|
||||
whisper
|
||||
# NDAV exports MemoryView
|
||||
.full(Whisper::Params.new, samples)
|
||||
```
|
||||
|
||||
Custom context params
|
||||
---------------------
|
||||
|
||||
You can customize `Whisper::Context`'s behavior using `Whisper::Context::Params`.
|
||||
|
||||
```ruby
|
||||
context_params = Whisper::Context::Params.new(
|
||||
use_gpu: false,
|
||||
flash_attn: false,
|
||||
# etc
|
||||
)
|
||||
whisper = Whisper::Context.new("base", context_params)
|
||||
```
|
||||
|
||||
Using VAD separately from ASR
|
||||
-----------------------------
|
||||
|
||||
VAD feature itself is useful. You can use it separately from ASR:
|
||||
|
||||
```ruby
|
||||
vad = Whisper::VAD::Context.new("silero-v6.2.0")
|
||||
vad
|
||||
.detect("path/to/audio.wav", Whisper::VAD::Params.new)
|
||||
.each.with_index do |segment, index|
|
||||
segment => {start_time: st, end_time: ed} # `Segment` responds to `#deconstruct_keys`
|
||||
|
||||
puts "[%{nth}: %{st} --> %{ed}]" % {nth: index + 1, st:, ed:}
|
||||
end
|
||||
```
|
||||
|
||||
You may also use the low-level API `Whisper::VAD::Context#segments_from_samples`, similar to `Whisper::Context#full`:
|
||||
|
||||
```ruby
|
||||
# Ruby Array
|
||||
reader = WaveFile::Reader.new("path/to/audio.wav", WaveFile::Format.new(:mono, :float, 16000))
|
||||
samples = reader.enum_for(:each_buffer).map(&:samples).flatten
|
||||
|
||||
# Or, object which exports MemoryView
|
||||
waveform, sample_rate = TorchAudio.load("test/fixtures/jfk.wav")
|
||||
samples = waveform.squeeze.numo.to_arrow.to_arrow_array
|
||||
|
||||
segments = vad.segments_from_samples(Whisper::VAD::Params.new, samples)
|
||||
```
|
||||
The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView. If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy.
|
||||
|
||||
Development
|
||||
-----------
|
||||
|
||||
% git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
% git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
% cd whisper.cpp/bindings/ruby
|
||||
% rake test
|
||||
|
||||
|
|
@ -451,15 +236,10 @@ First call of `rake test` builds an extension and downloads a model for testing.
|
|||
|
||||
If something seems wrong on build, running `rake clean` solves some cases.
|
||||
|
||||
### Need help ###
|
||||
|
||||
* Windows support
|
||||
* Refinement of C/C++ code, especially memory management
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
The same as [whisper.cpp][].
|
||||
|
||||
[whisper.cpp]: https://github.com/ggml-org/whisper.cpp
|
||||
[models]: https://github.com/ggml-org/whisper.cpp/tree/master/models
|
||||
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
|
||||
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models
|
||||
|
|
|
|||
|
|
@ -3,25 +3,22 @@ require "bundler/gem_tasks"
|
|||
require "rake/testtask"
|
||||
require_relative "extsources"
|
||||
|
||||
SOURCES_DIR = "ext/sources"
|
||||
|
||||
SOURCES = FileList[]
|
||||
|
||||
EXTSOURCES.each do |src|
|
||||
basename = src.pathmap("%f")
|
||||
dest = basename == "LICENSE" ? basename
|
||||
: src.pathmap("%{\\.\\./\\.\\.,#{SOURCES_DIR}}p")
|
||||
.pathmap("%{\\.\\./javascript,#{SOURCES_DIR}/bindings/javascript}p")
|
||||
dest = basename == "LICENSE" ? basename : src.pathmap("%{../..,ext}p")
|
||||
dir = dest.pathmap("%d")
|
||||
file src
|
||||
directory dir
|
||||
file dest => [src, dir] do |t|
|
||||
copy t.source, t.name
|
||||
cp t.source, t.name
|
||||
end
|
||||
SOURCES.include dest
|
||||
end
|
||||
|
||||
CLEAN.include SOURCES
|
||||
CLEAN.include FileList["ext/**/*.o", "ext/**/*.metal", "ext/**/*.tmp", "ext/whisper.{so,bundle,dll}"]
|
||||
|
||||
SRC = FileList["ext/*.{c,cpp,h}"]
|
||||
|
||||
|
|
@ -34,25 +31,11 @@ LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
|
|||
SO_FILE = File.join("ext", LIB_NAME)
|
||||
LIB_FILE = File.join("lib", LIB_NAME)
|
||||
|
||||
file "ext/Makefile" => SRC + SOURCES + FileList["ext/*.rb"] do |t|
|
||||
file "ext/Makefile" => SRC + ["ext/extconf.rb"] + SOURCES do |t|
|
||||
chdir "ext" do
|
||||
ruby "extconf.rb"
|
||||
end
|
||||
end
|
||||
if File.exist? "ext/Makefile"
|
||||
task :make_clean do
|
||||
cd "ext" do
|
||||
sh "make", "clean"
|
||||
end
|
||||
end
|
||||
task clean: :make_clean
|
||||
task :make_distclean do
|
||||
cd "ext" do
|
||||
sh "make", "distclean"
|
||||
end
|
||||
end
|
||||
task clobber: :make_distclean
|
||||
end
|
||||
|
||||
file SO_FILE => "ext/Makefile" do |t|
|
||||
chdir "ext" do
|
||||
|
|
@ -67,30 +50,17 @@ file LIB_FILE => [SO_FILE, "lib"] do |t|
|
|||
end
|
||||
CLEAN.include LIB_FILE
|
||||
|
||||
Rake::TestTask.new
|
||||
|
||||
TEST_FIXTURE_AUDIO = "test/fixtures/jfk.wav"
|
||||
TEST_FIXTURE_AUDIO_SRC = File.expand_path(File.join(__dir__, "..", "..", "samples", "jfk.wav"))
|
||||
TEST_FIXTURE_AUDIO_DIR = TEST_FIXTURE_AUDIO.pathmap("%d")
|
||||
directory TEST_FIXTURE_AUDIO_DIR
|
||||
if File.exist? TEST_FIXTURE_AUDIO_SRC
|
||||
file TEST_FIXTURE_AUDIO => [TEST_FIXTURE_AUDIO_SRC, TEST_FIXTURE_AUDIO_DIR] do |t|
|
||||
symlink t.source, t.name
|
||||
end
|
||||
else
|
||||
require "open-uri"
|
||||
file TEST_FIXTURE_AUDIO => TEST_FIXTURE_AUDIO_DIR do |t|
|
||||
File.write t.name, URI("https://github.com/ggml-org/whisper.cpp/raw/refs/heads/master/samples/jfk.wav").read
|
||||
end
|
||||
Rake::TestTask.new do |t|
|
||||
t.test_files = FileList["tests/test_*.rb"]
|
||||
end
|
||||
|
||||
TEST_MEMORY_VIEW = "test/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
|
||||
file TEST_MEMORY_VIEW => "test/jfk_reader/jfk_reader.c" do |t|
|
||||
chdir "test/jfk_reader" do
|
||||
TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
|
||||
file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
|
||||
chdir "tests/jfk_reader" do
|
||||
ruby "extconf.rb"
|
||||
sh "make"
|
||||
end
|
||||
end
|
||||
CLEAN.include TEST_MEMORY_VIEW
|
||||
CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
|
||||
|
||||
task test: [LIB_FILE, TEST_MEMORY_VIEW, TEST_FIXTURE_AUDIO]
|
||||
task test: [LIB_FILE, TEST_MEMORY_VIEW]
|
||||
|
|
|
|||
|
|
@ -2,8 +2,10 @@ Makefile
|
|||
whisper.so
|
||||
whisper.bundle
|
||||
whisper.dll
|
||||
scripts/get-flags.mk
|
||||
*.o
|
||||
*.a
|
||||
sources/*
|
||||
!sources/CMakeGraphVizOptions.cmake
|
||||
mkmf.log
|
||||
/*/**/*.c
|
||||
/*/**/*.cpp
|
||||
/*/**/*.h
|
||||
/*/**/*.m
|
||||
/*/**/*.metal
|
||||
|
|
|
|||
|
|
@ -0,0 +1,9 @@
|
|||
ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
|
||||
ggml/src/ggml-cpu/ggml-cpu.cpp \
|
||||
ggml/include/ggml-backend.h \
|
||||
ggml/include/ggml.h \
|
||||
ggml/include/ggml-alloc.h \
|
||||
ggml/src/ggml-backend-impl.h \
|
||||
ggml/include/ggml-cpu.h \
|
||||
ggml/src/ggml-impl.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
|
@ -1,79 +0,0 @@
|
|||
require "tsort"
|
||||
|
||||
class Dependencies
|
||||
include TSort
|
||||
|
||||
def initialize(cmake, options)
|
||||
@cmake = cmake
|
||||
@options = options
|
||||
@static_lib_shape = nil
|
||||
@nodes = {}
|
||||
@graph = Hash.new {|h, k| h[k] = []}
|
||||
|
||||
generate_dot
|
||||
parse_dot
|
||||
end
|
||||
|
||||
def libs
|
||||
tsort.filter_map {|node|
|
||||
label, shape = @nodes[node]
|
||||
if shape == @static_lib_shape
|
||||
label.gsub(/\\n\([^)]+\)/, '')
|
||||
else
|
||||
nil
|
||||
end
|
||||
}.reverse.collect {|lib| "#{prefix(lib)}#{lib}.#{RbConfig::CONFIG['LIBEXT']}"}
|
||||
end
|
||||
|
||||
def to_s
|
||||
libs.join(" ")
|
||||
end
|
||||
|
||||
def local_libs
|
||||
to_s
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def dot_path
|
||||
File.join(__dir__, "build", "whisper.cpp.dot")
|
||||
end
|
||||
|
||||
def generate_dot
|
||||
system @cmake, "-S", "sources", "-B", "build", *@options.graphviz_cmake_args, "--graphviz", dot_path, *@options, exception: true
|
||||
end
|
||||
|
||||
def parse_dot
|
||||
File.open(dot_path).each_line do |line|
|
||||
case line
|
||||
when /\[\s*label\s*=\s*"Static Library"\s*,\s*shape\s*=\s*(?<shape>\w+)\s*\]/
|
||||
@static_lib_shape = $~[:shape]
|
||||
when /\A\s*"(?<node>\w+)"\s*\[\s*label\s*=\s*"(?<label>\S+)"\s*,\s*shape\s*=\s*(?<shape>\w+)\s*\]\s*;\s*\z/
|
||||
node = $~[:node]
|
||||
label = $~[:label]
|
||||
shape = $~[:shape]
|
||||
@nodes[node] = [label, shape]
|
||||
when /\A\s*"(?<depender>\w+)"\s*->\s*"(?<dependee>\w+)"/
|
||||
depender = $~[:depender]
|
||||
dependee = $~[:dependee]
|
||||
@graph[depender] << dependee
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def prefix(lib)
|
||||
"lib"
|
||||
end
|
||||
|
||||
def tsort_each_node
|
||||
@nodes.each_key do |node|
|
||||
yield node
|
||||
end
|
||||
end
|
||||
|
||||
def tsort_each_child(node)
|
||||
@graph[node].each do |child|
|
||||
yield child
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
require_relative "dependencies"
|
||||
|
||||
class DependenciesForWindows < Dependencies
|
||||
def local_libs
|
||||
libs.collect {|lib| %|"#{lib_path(lib)}"|}.join(" ")
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def prefix(lib)
|
||||
lib.start_with?("ggml") ? "" : "lib"
|
||||
end
|
||||
|
||||
def lib_path(lib)
|
||||
File.join(__dir__, lib).tr("\\", "/")
|
||||
end
|
||||
end
|
||||
|
|
@ -1,35 +1,208 @@
|
|||
require "mkmf"
|
||||
require 'mkmf'
|
||||
|
||||
if RUBY_PLATFORM.match? /mswin|mingw|ucrt/
|
||||
require_relative "options_for_windows"
|
||||
require_relative "dependencies_for_windows"
|
||||
# need to use c++ compiler flags
|
||||
$CXXFLAGS << ' -std=c++17'
|
||||
|
||||
Opts = OptionsForWindows
|
||||
Deps = DependenciesForWindows
|
||||
$LDFLAGS << ' -lstdc++'
|
||||
|
||||
# Set to true when building binary gems
|
||||
if enable_config('static-stdlib', false)
|
||||
$LDFLAGS << ' -static-libgcc -static-libstdc++'
|
||||
end
|
||||
|
||||
if enable_config('march-tune-native', false)
|
||||
$CFLAGS << ' -march=native -mtune=native'
|
||||
$CXXFLAGS << ' -march=native -mtune=native'
|
||||
end
|
||||
|
||||
if ENV['WHISPER_METAL']
|
||||
$GGML_METAL ||= true
|
||||
$DEPRECATE_WARNING ||= true
|
||||
end
|
||||
|
||||
$UNAME_S = `uname -s`.chomp
|
||||
$UNAME_P = `uname -p`.chomp
|
||||
$UNAME_M = `uname -m`.chomp
|
||||
|
||||
if $UNAME_S == 'Darwin'
|
||||
unless ENV['GGML_NO_METAL']
|
||||
$GGML_METAL ||= true
|
||||
end
|
||||
$GGML_NO_OPENMP ||= true
|
||||
end
|
||||
|
||||
if $GGML_METAL
|
||||
$GGML_METAL_EMBED_LIBRARY = true
|
||||
end
|
||||
|
||||
$MK_CPPFLAGS = '-Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -Iexamples -DGGML_USE_CPU'
|
||||
$MK_CFLAGS = '-std=c11 -fPIC'
|
||||
$MK_CXXFLAGS = '-std=c++17 -fPIC'
|
||||
$MK_NVCCFLAGS = '-std=c++17'
|
||||
$MK_LDFLAGS = ''
|
||||
|
||||
$OBJ_GGML = []
|
||||
$OBJ_WHISPER = []
|
||||
$OBJ_COMMON = []
|
||||
$OBJ_SDL = []
|
||||
|
||||
$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
|
||||
|
||||
if $UNAME_S == 'Linux'
|
||||
$MK_CPPFLAGS << ' -D_GNU_SOURCE'
|
||||
end
|
||||
|
||||
if $UNAME_S == 'Darwin'
|
||||
$MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
|
||||
end
|
||||
|
||||
if ENV['WHISPER_DEBUG']
|
||||
$MK_CFLAGS << ' -O0 -g'
|
||||
$MK_CXXFLAGS << ' -O0 -g'
|
||||
$MK_LDFLAGS << ' -g'
|
||||
$MK_NVCCFLAGS << ' -O0 -g'
|
||||
else
|
||||
require_relative "options"
|
||||
require_relative "dependencies"
|
||||
|
||||
Opts = Options
|
||||
Deps = Dependencies
|
||||
$MK_CPPFLAGS << ' -DNDEBUG'
|
||||
$MK_CFLAGS << ' -O3'
|
||||
$MK_CXXFLAGS << ' -O3'
|
||||
$MK_NVCCFLAGS << ' -O3'
|
||||
end
|
||||
|
||||
cmake = find_executable("cmake") || abort
|
||||
options = Opts.new(cmake)
|
||||
have_library("gomp") rescue nil
|
||||
libs = Deps.new(cmake, options)
|
||||
$WARN_FLAGS =
|
||||
' -Wall' <<
|
||||
' -Wextra' <<
|
||||
' -Wpedantic' <<
|
||||
' -Wcast-qual' <<
|
||||
' -Wno-unused-function'
|
||||
|
||||
append_cflags ["-O3", "-march=native"]
|
||||
$INCFLAGS << " -Isources/include -Isources/ggml/include -Isources/examples"
|
||||
$LOCAL_LIBS << " #{libs.local_libs}"
|
||||
$cleanfiles << " build #{libs}"
|
||||
$MK_CFLAGS <<
|
||||
$WARN_FLAGS <<
|
||||
' -Wshadow' <<
|
||||
' -Wstrict-prototypes' <<
|
||||
' -Wpointer-arith' <<
|
||||
' -Wmissing-prototypes' <<
|
||||
' -Werror=implicit-int' <<
|
||||
' -Werror=implicit-function-declaration'
|
||||
|
||||
create_makefile "whisper" do |conf|
|
||||
conf << <<~EOF
|
||||
$(TARGET_SO): #{libs}
|
||||
#{libs}: cmake-targets
|
||||
cmake-targets:
|
||||
#{"\t"}"#{cmake}" -S sources -B build #{options}
|
||||
#{"\t"}"#{cmake}" --build build --config Release --target common whisper
|
||||
EOF
|
||||
$MK_CXXFLAGS <<
|
||||
$WARN_FLAGS <<
|
||||
' -Wmissing-declarations' <<
|
||||
' -Wmissing-noreturn'
|
||||
|
||||
unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
|
||||
$MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
|
||||
end
|
||||
|
||||
if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
|
||||
$MK_CFLAGS << ' -pthread'
|
||||
$MK_CXXFLAGS << ' -pthread'
|
||||
end
|
||||
|
||||
unless $_WIN32
|
||||
$DSO_EXT = '.so'
|
||||
else
|
||||
$DSO_EXT = '.dll'
|
||||
end
|
||||
|
||||
unless ENV['RISCV']
|
||||
if %w[x86_64 i686 amd64].include? $UNAME_M
|
||||
$HOST_CXXFLAGS ||= ''
|
||||
|
||||
$MK_CFLAGS << ' -march=native -mtune=native'
|
||||
$HOST_CXXFLAGS << ' -march=native -mtune=native'
|
||||
end
|
||||
else
|
||||
$MK_CFLAGS << ' -march=rv64gcv -mabi=lp64d'
|
||||
$MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
|
||||
end
|
||||
|
||||
unless ENV['GGML_NO_ACCELERATE']
|
||||
if $UNAME_S == 'Darwin'
|
||||
$MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE'
|
||||
$MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
|
||||
$MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
|
||||
$MK_LDFLAGS << ' -framework Accelerate'
|
||||
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
|
||||
end
|
||||
end
|
||||
|
||||
if ENV['GGML_OPENBLAS']
|
||||
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
|
||||
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas)`.chomp}"
|
||||
$MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
|
||||
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
|
||||
end
|
||||
|
||||
if ENV['GGML_OPENBLAS64']
|
||||
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
|
||||
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64)`.chomp}"
|
||||
$MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
|
||||
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
|
||||
end
|
||||
|
||||
if $GGML_METAL
|
||||
$MK_CPPFLAGS << ' -DGGML_USE_METAL'
|
||||
$MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
|
||||
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal.o'
|
||||
|
||||
if ENV['GGML_METAL_NDEBUG']
|
||||
$MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
|
||||
end
|
||||
|
||||
if $GGML_METAL_EMBED_LIBRARY
|
||||
$MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
|
||||
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal-embed.o'
|
||||
end
|
||||
end
|
||||
|
||||
$OBJ_GGML <<
|
||||
'ggml/src/ggml.o' <<
|
||||
'ggml/src/ggml-alloc.o' <<
|
||||
'ggml/src/ggml-backend.o' <<
|
||||
'ggml/src/ggml-backend-reg.o' <<
|
||||
'ggml/src/ggml-opt.o' <<
|
||||
'ggml/src/ggml-quants.o' <<
|
||||
'ggml/src/ggml-threading.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-cpp.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-aarch64.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-hbm.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-quants.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-traits.o'
|
||||
|
||||
$OBJ_WHISPER <<
|
||||
'src/whisper.o' <<
|
||||
'examples/common.o' <<
|
||||
'examples/common-whisper.o'
|
||||
|
||||
$objs = $OBJ_GGML + $OBJ_WHISPER + $OBJ_COMMON + $OBJ_SDL
|
||||
$objs <<
|
||||
"ruby_whisper.o" <<
|
||||
"ruby_whisper_context.o" <<
|
||||
"ruby_whisper_transcribe.o" <<
|
||||
"ruby_whisper_params.o" <<
|
||||
"ruby_whisper_error.o" <<
|
||||
"ruby_whisper_segment.o" <<
|
||||
"ruby_whisper_model.o"
|
||||
|
||||
$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
|
||||
$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
|
||||
$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
|
||||
$CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
|
||||
$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
|
||||
$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
|
||||
|
||||
create_makefile('whisper')
|
||||
|
||||
File.open 'Makefile', 'a' do |file|
|
||||
file.puts 'include scripts/get-flags.mk'
|
||||
file.puts 'include cpu.mk'
|
||||
|
||||
if $GGML_METAL
|
||||
file.puts 'include metal.mk'
|
||||
|
||||
if $GGML_METAL_EMBED_LIBRARY
|
||||
file.puts 'include metal-embed.mk'
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
|||
|
|
@ -0,0 +1,17 @@
|
|||
ggml/src/ggml-metal/ggml-metal-embed.o: \
|
||||
ggml/src/ggml-metal/ggml-metal.metal \
|
||||
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||
ggml/src/ggml-common.h
|
||||
@echo "Embedding Metal library"
|
||||
@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
|
||||
@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
|
||||
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
|
||||
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
|
||||
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
|
||||
@rmdir ${TEMP_ASSEMBLY}
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
ggml/src/ggml-metal/ggml-metal.o: \
|
||||
ggml/src/ggml-metal/ggml-metal.m \
|
||||
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||
ggml/include/ggml-metal.h \
|
||||
ggml/include/ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
|
@ -1,125 +0,0 @@
|
|||
require "fileutils"
|
||||
|
||||
class Options
|
||||
def initialize(cmake="cmake")
|
||||
@cmake = cmake
|
||||
@options = {}
|
||||
|
||||
configure
|
||||
write_cache_file
|
||||
end
|
||||
|
||||
def to_a
|
||||
[
|
||||
"-D", "BUILD_SHARED_LIBS=OFF",
|
||||
"-D", "WHISPER_BUILD_TESTS=OFF",
|
||||
"-D", "CMAKE_ARCHIVE_OUTPUT_DIRECTORY=#{__dir__}",
|
||||
"-D", "CMAKE_POSITION_INDEPENDENT_CODE=ON",
|
||||
"-C", cache_path
|
||||
]
|
||||
end
|
||||
|
||||
def to_s
|
||||
command_line(*to_a)
|
||||
end
|
||||
|
||||
def graphviz_cmake_args
|
||||
[]
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def cmake_options
|
||||
@cmake_options ||= cmake_options_output.lines.drop_while {|line| line.chomp != "-- Cache values"}.drop(1)
|
||||
.filter_map {|line|
|
||||
option, value = line.chomp.split("=", 2)
|
||||
name, type = option.split(":", 2)
|
||||
[
|
||||
name,
|
||||
[
|
||||
type,
|
||||
type == "BOOL" ? value == "ON" : value
|
||||
]
|
||||
]
|
||||
}.to_h
|
||||
end
|
||||
|
||||
def cmake_options_output
|
||||
Dir.chdir(__dir__) do
|
||||
IO.popen([@cmake, "-S", "sources", "-B", "build", "-L"]) {|io| io.read}
|
||||
end
|
||||
end
|
||||
|
||||
def configure
|
||||
cmake_options.each_pair do |name, (type, default_value)|
|
||||
option = option_name(name)
|
||||
value = type == "BOOL" ? enable_config(option) : arg_config("--#{option}")
|
||||
@options[name] = [type, value]
|
||||
end
|
||||
|
||||
configure_accelerate
|
||||
configure_metal
|
||||
configure_coreml
|
||||
end
|
||||
|
||||
# See ggml/src/ggml-cpu/CMakeLists.txt
|
||||
def configure_accelerate
|
||||
if RUBY_PLATFORM.match?(/darwin/) && enabled?("GGML_ACCELERATE")
|
||||
$LDFLAGS << " -framework Accelerate"
|
||||
end
|
||||
end
|
||||
|
||||
# See ggml/src/ggml-metal/CMakeLists.txt
|
||||
def configure_metal
|
||||
$LDFLAGS << " -framework Foundation -framework Metal -framework MetalKit" if enabled?("GGML_METAL")
|
||||
end
|
||||
|
||||
# See src/CmakeLists.txt
|
||||
def configure_coreml
|
||||
if enabled?("WHISPER_COREML")
|
||||
$LDFLAGS << " -framework Foundation -framework CoreML"
|
||||
$defs << "-DRUBY_WHISPER_USE_COREML"
|
||||
end
|
||||
end
|
||||
|
||||
def option_name(name)
|
||||
name.downcase.gsub("_", "-")
|
||||
end
|
||||
|
||||
def enabled?(option)
|
||||
op = @options[option]
|
||||
return false unless op
|
||||
return false unless op[0] == "BOOL"
|
||||
if op[1].nil?
|
||||
cmake_options[option][1]
|
||||
else
|
||||
op[1]
|
||||
end
|
||||
end
|
||||
|
||||
def cache_path
|
||||
File.join(__dir__, "sources", "Options.cmake")
|
||||
end
|
||||
|
||||
def write_cache_file
|
||||
FileUtils.mkpath File.dirname(cache_path)
|
||||
File.open cache_path, "w" do |file|
|
||||
@options.reject {|name, (type, value)| value.nil?}.each do |name, (type, value)|
|
||||
line = "set(CACHE{%<name>s} TYPE %<type>s FORCE VALUE %<value>s)" % {
|
||||
name:,
|
||||
type:,
|
||||
value: value == true ? "ON" : value == false ? "OFF" : escape_cmake(value)
|
||||
}
|
||||
file.puts line
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def escape_cmake(str)
|
||||
str.gsub(/[\\"]/, '\\\\\&')
|
||||
end
|
||||
|
||||
def command_line(*args)
|
||||
args.collect {|arg| %|"#{arg.to_s.gsub(/[\\"]/, '\\\\\&')}"|}.join(" ")
|
||||
end
|
||||
end
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
require_relative "options"
|
||||
|
||||
class OptionsForWindows < Options
|
||||
def to_s
|
||||
command_line(*generator_args, *to_a)
|
||||
end
|
||||
|
||||
def graphviz_cmake_args
|
||||
generator_args
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def arm?
|
||||
RbConfig::CONFIG["host_cpu"].to_s.downcase.match?(/\A(?:arm64|aarch64)\z/)
|
||||
end
|
||||
|
||||
def cmake_options_output
|
||||
Dir.chdir(__dir__) do
|
||||
IO.popen([@cmake, "-S", "sources", "-B", "build", *generator_args, "-L"]) {|io| io.read}
|
||||
end
|
||||
end
|
||||
|
||||
def generator_args
|
||||
generator = cmake_generator
|
||||
["-G", generator] if generator && !generator.empty?
|
||||
end
|
||||
|
||||
def cmake_generator
|
||||
return @cmake_generator if defined?(@cmake_generator)
|
||||
|
||||
generator = ENV["CMAKE_GENERATOR"]
|
||||
abort "CMAKE_GENERATOR=#{generator} is unsupported for mingw/ucrt Ruby" if visual_studio_generator_name?(generator)
|
||||
return @cmake_generator = generator unless generator.nil? || generator.empty?
|
||||
|
||||
ninja = find_executable("ninja")
|
||||
return @cmake_generator = "Ninja" if ninja
|
||||
|
||||
make = find_executable("make")
|
||||
return @cmake_generator = "MSYS Makefiles" if make
|
||||
|
||||
mingw32_make = find_executable("mingw32-make")
|
||||
return @cmake_generator = "MinGW Makefiles" if mingw32_make
|
||||
|
||||
@cmake_generator = nil
|
||||
end
|
||||
|
||||
def visual_studio_generator_name?(generator)
|
||||
generator && generator.start_with?("Visual Studio")
|
||||
end
|
||||
end
|
||||
|
|
@ -1,17 +1,13 @@
|
|||
#include <ruby.h>
|
||||
#include <ruby/memory_view.h>
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
VALUE mWhisper;
|
||||
VALUE mVAD;
|
||||
VALUE cContext;
|
||||
VALUE cParams;
|
||||
VALUE cVADContext;
|
||||
VALUE cVADParams;
|
||||
VALUE cVADSegments;
|
||||
VALUE cVADSegment;
|
||||
VALUE eError;
|
||||
|
||||
VALUE cSegment;
|
||||
VALUE cToken;
|
||||
VALUE cModel;
|
||||
|
||||
ID id_to_s;
|
||||
|
|
@ -24,27 +20,17 @@ ID id_new;
|
|||
ID id_to_path;
|
||||
ID id_URI;
|
||||
ID id_pre_converted_models;
|
||||
ID id_coreml_compiled_models;
|
||||
ID id_cache;
|
||||
ID id_n_processors;
|
||||
|
||||
static bool is_log_callback_finalized = false;
|
||||
static bool is_ruby_log_callback_present = false;
|
||||
|
||||
// High level API
|
||||
extern VALUE ruby_whisper_segment_allocate(VALUE klass);
|
||||
|
||||
extern VALUE init_ruby_whisper_context(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_context_params(VALUE *cContext);
|
||||
extern void init_ruby_whisper_context(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_params(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_error(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_segment(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_token(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
|
||||
extern void init_ruby_whisper_model(VALUE *mWhisper);
|
||||
extern void init_ruby_whisper_vad_params(VALUE *mVAD);
|
||||
extern void init_ruby_whisper_vad_context(VALUE *mVAD);
|
||||
extern void init_ruby_whisper_vad_segment(VALUE *mVAD);
|
||||
extern void init_ruby_whisper_vad_segments(VALUE *mVAD);
|
||||
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
|
||||
|
||||
/*
|
||||
|
|
@ -94,56 +80,19 @@ static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) {
|
|||
return rb_str_new2(str_full);
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* system_info_str -> String
|
||||
*/
|
||||
static VALUE ruby_whisper_s_system_info_str(VALUE self) {
|
||||
return rb_str_new2(whisper_print_system_info());
|
||||
}
|
||||
|
||||
static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) {
|
||||
is_log_callback_finalized = true;
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
int level;
|
||||
const char * buffer;
|
||||
} call_log_callbacks_args;
|
||||
|
||||
static void*
|
||||
call_log_callbacks(void *v_args) {
|
||||
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
|
||||
if (NIL_P(log_callback)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
call_log_callbacks_args *args = (call_log_callbacks_args *)v_args;
|
||||
VALUE user_data = rb_iv_get(mWhisper, "user_data");
|
||||
rb_funcall(log_callback, id_call, 3, INT2NUM(args->level), rb_str_new2(args->buffer), user_data);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
ruby_whisper_log_callback(enum ggml_log_level level, const char * buffer, void * user_data) {
|
||||
if (is_log_callback_finalized) {
|
||||
return;
|
||||
}
|
||||
if (!is_ruby_log_callback_present) {
|
||||
return;
|
||||
}
|
||||
|
||||
call_log_callbacks_args args = {
|
||||
level,
|
||||
buffer,
|
||||
};
|
||||
if (ruby_thread_has_gvl_p()) {
|
||||
call_log_callbacks((void *)&args);
|
||||
} else {
|
||||
rb_thread_call_with_gvl(call_log_callbacks, (void *)&args);
|
||||
}
|
||||
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
|
||||
VALUE udata = rb_iv_get(mWhisper, "user_data");
|
||||
rb_funcall(log_callback, id_call, 3, INT2NUM(level), rb_str_new2(buffer), udata);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -159,22 +108,24 @@ static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_d
|
|||
rb_iv_set(self, "log_callback", log_callback);
|
||||
rb_iv_set(self, "user_data", user_data);
|
||||
|
||||
if (!NIL_P(log_callback)) {
|
||||
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
|
||||
rb_define_finalizer(log_callback, finalize_log_callback);
|
||||
}
|
||||
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
|
||||
rb_define_finalizer(log_callback, finalize_log_callback);
|
||||
|
||||
if (NIL_P(log_callback)) {
|
||||
whisper_log_set(NULL, NULL);
|
||||
is_ruby_log_callback_present = false;
|
||||
} else {
|
||||
whisper_log_set(ruby_whisper_log_callback, NULL);
|
||||
is_ruby_log_callback_present = true;
|
||||
}
|
||||
whisper_log_set(ruby_whisper_log_callback, NULL);
|
||||
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
|
||||
rb_gc_mark(rwm->context);
|
||||
}
|
||||
|
||||
static VALUE ruby_whisper_model_allocate(VALUE klass) {
|
||||
ruby_whisper_model *rwm;
|
||||
rwm = ALLOC(ruby_whisper_model);
|
||||
return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
|
||||
}
|
||||
|
||||
void Init_whisper() {
|
||||
id_to_s = rb_intern("to_s");
|
||||
id_call = rb_intern("call");
|
||||
|
|
@ -186,14 +137,9 @@ void Init_whisper() {
|
|||
id_to_path = rb_intern("to_path");
|
||||
id_URI = rb_intern("URI");
|
||||
id_pre_converted_models = rb_intern("pre_converted_models");
|
||||
id_coreml_compiled_models = rb_intern("coreml_compiled_models");
|
||||
id_cache = rb_intern("cache");
|
||||
id_n_processors = rb_intern("n_processors");
|
||||
|
||||
mWhisper = rb_define_module("Whisper");
|
||||
mVAD = rb_define_module_under(mWhisper, "VAD");
|
||||
|
||||
rb_define_const(mWhisper, "VERSION", rb_str_new2(whisper_version()));
|
||||
rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
|
||||
rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
|
||||
rb_define_const(mWhisper, "LOG_LEVEL_WARN", INT2NUM(GGML_LOG_LEVEL_WARN));
|
||||
|
|
@ -201,43 +147,18 @@ void Init_whisper() {
|
|||
rb_define_const(mWhisper, "LOG_LEVEL_DEBUG", INT2NUM(GGML_LOG_LEVEL_DEBUG));
|
||||
rb_define_const(mWhisper, "LOG_LEVEL_CONT", INT2NUM(GGML_LOG_LEVEL_CONT));
|
||||
|
||||
rb_define_const(mWhisper, "AHEADS_NONE", INT2NUM(WHISPER_AHEADS_NONE));
|
||||
rb_define_const(mWhisper, "AHEADS_N_TOP_MOST", INT2NUM(WHISPER_AHEADS_N_TOP_MOST));
|
||||
rb_define_const(mWhisper, "AHEADS_CUSTOM", INT2NUM(WHISPER_AHEADS_CUSTOM));
|
||||
rb_define_const(mWhisper, "AHEADS_TINY_EN", INT2NUM(WHISPER_AHEADS_TINY_EN));
|
||||
rb_define_const(mWhisper, "AHEADS_TINY", INT2NUM(WHISPER_AHEADS_TINY));
|
||||
rb_define_const(mWhisper, "AHEADS_BASE_EN", INT2NUM(WHISPER_AHEADS_BASE_EN));
|
||||
rb_define_const(mWhisper, "AHEADS_BASE", INT2NUM(WHISPER_AHEADS_BASE));
|
||||
rb_define_const(mWhisper, "AHEADS_SMALL_EN", INT2NUM(WHISPER_AHEADS_SMALL_EN));
|
||||
rb_define_const(mWhisper, "AHEADS_SMALL", INT2NUM(WHISPER_AHEADS_SMALL));
|
||||
rb_define_const(mWhisper, "AHEADS_MEDIUM_EN", INT2NUM(WHISPER_AHEADS_MEDIUM_EN));
|
||||
rb_define_const(mWhisper, "AHEADS_MEDIUM", INT2NUM(WHISPER_AHEADS_MEDIUM));
|
||||
rb_define_const(mWhisper, "AHEADS_LARGE_V1", INT2NUM(WHISPER_AHEADS_LARGE_V1));
|
||||
rb_define_const(mWhisper, "AHEADS_LARGE_V2", INT2NUM(WHISPER_AHEADS_LARGE_V2));
|
||||
rb_define_const(mWhisper, "AHEADS_LARGE_V3", INT2NUM(WHISPER_AHEADS_LARGE_V3));
|
||||
rb_define_const(mWhisper, "AHEADS_LARGE_V3_TURBO", INT2NUM(WHISPER_AHEADS_LARGE_V3_TURBO));
|
||||
|
||||
rb_define_singleton_method(mWhisper, "lang_max_id", ruby_whisper_s_lang_max_id, 0);
|
||||
rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
|
||||
rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);
|
||||
rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1);
|
||||
rb_define_singleton_method(mWhisper, "system_info_str", ruby_whisper_s_system_info_str, 0);
|
||||
rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
|
||||
rb_define_private_method(rb_singleton_class(mWhisper), "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);
|
||||
|
||||
cContext = init_ruby_whisper_context(&mWhisper);
|
||||
init_ruby_whisper_context_params(&cContext);
|
||||
init_ruby_whisper_context(&mWhisper);
|
||||
init_ruby_whisper_params(&mWhisper);
|
||||
init_ruby_whisper_error(&mWhisper);
|
||||
init_ruby_whisper_segment(&mWhisper);
|
||||
init_ruby_whisper_token(&mWhisper);
|
||||
init_ruby_whisper_segment(&mWhisper, &cContext);
|
||||
init_ruby_whisper_model(&mWhisper);
|
||||
init_ruby_whisper_vad_params(&mVAD);
|
||||
init_ruby_whisper_vad_segment(&mVAD);
|
||||
init_ruby_whisper_vad_segments(&mVAD);
|
||||
init_ruby_whisper_vad_context(&mVAD);
|
||||
|
||||
rb_require("whisper/context");
|
||||
rb_require("whisper/segment");
|
||||
rb_require("whisper/model/uri");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,18 +1,8 @@
|
|||
#ifndef RUBY_WHISPER_H
|
||||
#define RUBY_WHISPER_H
|
||||
|
||||
#include <ruby.h>
|
||||
#include <ruby/version.h>
|
||||
#include <ruby/util.h>
|
||||
#include <ruby/thread.h>
|
||||
#include <ruby/memory_view.h>
|
||||
#include "whisper.h"
|
||||
|
||||
#if RUBY_API_VERSION_MAJOR < 4
|
||||
// Exists but not declared as public API
|
||||
int ruby_thread_has_gvl_p(void);
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
VALUE *context;
|
||||
VALUE user_data;
|
||||
|
|
@ -20,104 +10,25 @@ typedef struct {
|
|||
VALUE callbacks;
|
||||
} ruby_whisper_callback_container;
|
||||
|
||||
typedef struct {
|
||||
VALUE *context;
|
||||
VALUE user_data;
|
||||
VALUE callback;
|
||||
VALUE callbacks;
|
||||
bool is_interrupted;
|
||||
} ruby_whisper_abort_callback_container;
|
||||
|
||||
typedef struct {
|
||||
struct whisper_context *context;
|
||||
} ruby_whisper;
|
||||
|
||||
typedef struct ruby_whisper_context_params {
|
||||
struct whisper_context_params params;
|
||||
} ruby_whisper_context_params;
|
||||
|
||||
typedef struct {
|
||||
struct whisper_full_params params;
|
||||
bool diarize;
|
||||
ruby_whisper_callback_container *new_segment_callback_container;
|
||||
ruby_whisper_callback_container *progress_callback_container;
|
||||
ruby_whisper_callback_container *encoder_begin_callback_container;
|
||||
ruby_whisper_abort_callback_container *abort_callback_container;
|
||||
VALUE vad_params;
|
||||
ruby_whisper_callback_container *abort_callback_container;
|
||||
} ruby_whisper_params;
|
||||
|
||||
typedef struct {
|
||||
struct whisper_vad_params params;
|
||||
} ruby_whisper_vad_params;
|
||||
|
||||
typedef struct {
|
||||
VALUE context;
|
||||
int index;
|
||||
} ruby_whisper_segment;
|
||||
|
||||
typedef struct {
|
||||
whisper_token_data *token_data;
|
||||
VALUE text;
|
||||
} ruby_whisper_token;
|
||||
|
||||
typedef struct {
|
||||
VALUE context;
|
||||
} ruby_whisper_model;
|
||||
|
||||
typedef struct {
|
||||
struct whisper_vad_segments *segments;
|
||||
} ruby_whisper_vad_segments;
|
||||
|
||||
typedef struct {
|
||||
VALUE segments;
|
||||
int index;
|
||||
} ruby_whisper_vad_segment;
|
||||
|
||||
typedef struct {
|
||||
struct whisper_vad_context *context;
|
||||
} ruby_whisper_vad_context;
|
||||
|
||||
typedef struct parsed_samples_t {
|
||||
float *samples;
|
||||
int n_samples;
|
||||
rb_memory_view_t memview;
|
||||
bool memview_exported;
|
||||
} parsed_samples_t;
|
||||
|
||||
/*
 * Typed-data accessors. Each macro unwraps `obj` into the given pointer
 * variable using the matching rb_data_type_t; the variants that guard a
 * native handle raise RuntimeError("Not initialized") when that handle is
 * still NULL.
 */

/* Unwrap a ruby_whisper and require a non-NULL whisper context. */
#define GetContext(obj, rw) do { \
  TypedData_Get_Struct((obj), ruby_whisper, &ruby_whisper_type, (rw)); \
  if (!(rw)->context) { \
    rb_raise(rb_eRuntimeError, "Not initialized"); \
  } \
} while (0)

/* Unwrap a ruby_whisper_context_params (no initialization check). */
#define GetContextParams(obj, rwcp) do { \
  TypedData_Get_Struct((obj), ruby_whisper_context_params, &ruby_whisper_context_params_type, (rwcp)); \
} while (0)

/* Unwrap a ruby_whisper_token and require non-NULL token data. */
#define GetToken(obj, rwt) do { \
  TypedData_Get_Struct((obj), ruby_whisper_token, &ruby_whisper_token_type, (rwt)); \
  if (!(rwt)->token_data) { \
    rb_raise(rb_eRuntimeError, "Not initialized"); \
  } \
} while (0)

/* Unwrap a ruby_whisper_vad_context and require a non-NULL VAD context. */
#define GetVADContext(obj, rwvc) do { \
  TypedData_Get_Struct((obj), ruby_whisper_vad_context, &ruby_whisper_vad_context_type, (rwvc)); \
  if (!(rwvc)->context) { \
    rb_raise(rb_eRuntimeError, "Not initialized"); \
  } \
} while (0)

/* Unwrap a ruby_whisper_vad_params (no initialization check). */
#define GetVADParams(obj, rwvp) do { \
  TypedData_Get_Struct((obj), ruby_whisper_vad_params, &ruby_whisper_vad_params_type, (rwvp)); \
} while (0)

/* Unwrap a ruby_whisper_vad_segments and require a non-NULL segments handle. */
#define GetVADSegments(obj, rwvss) do { \
  TypedData_Get_Struct((obj), ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, (rwvss)); \
  if (!(rwvss)->segments) { \
    rb_raise(rb_eRuntimeError, "Not initialized"); \
  } \
} while (0)
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1,11 +1,7 @@
|
|||
#include <ruby.h>
|
||||
#include <ruby/memory_view.h>
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
#define IS_BIGENDIAN true
|
||||
#else
|
||||
#define IS_BIGENDIAN false
|
||||
#endif
|
||||
|
||||
extern ID id_to_s;
|
||||
extern ID id___method__;
|
||||
extern ID id_to_enum;
|
||||
|
|
@ -15,64 +11,15 @@ extern ID id_new;
|
|||
extern ID id_to_path;
|
||||
extern ID id_URI;
|
||||
extern ID id_pre_converted_models;
|
||||
extern ID id_coreml_compiled_models;
|
||||
extern ID id_cache;
|
||||
extern ID id_n_processors;
|
||||
|
||||
extern VALUE cContext;
|
||||
extern VALUE eError;
|
||||
extern VALUE cModel;
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_params_type;
|
||||
extern const rb_data_type_t ruby_whisper_context_params_type;
|
||||
extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
|
||||
extern VALUE rb_whisper_model_s_new(VALUE context);
|
||||
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
|
||||
extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context, int n_processors);
|
||||
|
||||
ID transcribe_option_names[1];
|
||||
|
||||
typedef struct fill_samples_args {
|
||||
float *dest;
|
||||
VALUE *src;
|
||||
int n_samples;
|
||||
} fill_samples_args;
|
||||
|
||||
typedef struct full_args {
|
||||
VALUE *context;
|
||||
VALUE *params;
|
||||
float *samples;
|
||||
int n_samples;
|
||||
} full_args;
|
||||
|
||||
typedef struct full_parallel_args {
|
||||
VALUE *context;
|
||||
VALUE *params;
|
||||
float *samples;
|
||||
int n_samples;
|
||||
int n_processors;
|
||||
} full_parallel_args;
|
||||
|
||||
typedef struct full_without_gvl_args {
|
||||
struct whisper_context *context;
|
||||
struct whisper_full_params *params;
|
||||
float *samples;
|
||||
int n_samples;
|
||||
int result;
|
||||
} full_without_gvl_args;
|
||||
|
||||
typedef struct full_parallel_without_gvl_args {
|
||||
struct whisper_context *context;
|
||||
struct whisper_full_params *params;
|
||||
float *samples;
|
||||
int n_samples;
|
||||
int n_processors;
|
||||
int result;
|
||||
} full_parallel_without_gvl_args;
|
||||
|
||||
typedef struct full_ubf_args {
|
||||
ruby_whisper_abort_callback_container *abort_callback_container;
|
||||
} full_ubf_args;
|
||||
extern VALUE rb_whisper_model_initialize(VALUE context);
|
||||
extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
|
||||
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
|
||||
|
||||
static void
|
||||
ruby_whisper_free(ruby_whisper *rw)
|
||||
|
|
@ -90,74 +37,19 @@ rb_whisper_mark(ruby_whisper *rw)
|
|||
}
|
||||
|
||||
void
|
||||
rb_whisper_free(void *p)
|
||||
rb_whisper_free(ruby_whisper *rw)
|
||||
{
|
||||
ruby_whisper *rw = (ruby_whisper *)p;
|
||||
ruby_whisper_free(rw);
|
||||
free(rw);
|
||||
}
|
||||
|
||||
static size_t
|
||||
ruby_whisper_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper *rw = (const ruby_whisper *)p;
|
||||
size_t size = sizeof(*rw);
|
||||
if (!rw) {
|
||||
return 0;
|
||||
}
|
||||
if (rw->context) {
|
||||
size += sizeof(rw->context);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
const rb_data_type_t ruby_whisper_type = {
|
||||
"ruby_whisper",
|
||||
{0, rb_whisper_free, ruby_whisper_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper, &ruby_whisper_type, rw);
|
||||
rw = ALLOC(ruby_whisper);
|
||||
rw->context = NULL;
|
||||
return obj;
|
||||
}
|
||||
|
||||
VALUE
|
||||
ruby_whisper_normalize_model_path(VALUE model_path)
|
||||
{
|
||||
VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
|
||||
VALUE pre_converted_model = rb_hash_aref(pre_converted_models, model_path);
|
||||
if (!NIL_P(pre_converted_model)) {
|
||||
model_path = pre_converted_model;
|
||||
#ifdef RUBY_WHISPER_USE_COREML
|
||||
VALUE coreml_converted_models = rb_funcall(cModel, id_coreml_compiled_models, 0);
|
||||
VALUE coreml_converted_model = rb_hash_aref(coreml_converted_models, pre_converted_model);
|
||||
if (!NIL_P(coreml_converted_model)) {
|
||||
rb_funcall(coreml_converted_model, id_cache, 0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if (TYPE(model_path) == T_STRING) {
|
||||
const char * model_path_str = StringValueCStr(model_path);
|
||||
if (strncmp("http://", model_path_str, 7) == 0 || strncmp("https://", model_path_str, 8) == 0) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
model_path = rb_class_new_instance(1, &model_path, uri_class);
|
||||
}
|
||||
}
|
||||
else if (rb_obj_is_kind_of(model_path, rb_path2class("URI::HTTP"))) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
model_path = rb_class_new_instance(1, &model_path, uri_class);
|
||||
}
|
||||
if (rb_respond_to(model_path, id_to_path)) {
|
||||
model_path = rb_funcall(model_path, id_to_path, 0);
|
||||
}
|
||||
|
||||
return model_path;
|
||||
return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, rw);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -171,25 +63,34 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
|
|||
{
|
||||
ruby_whisper *rw;
|
||||
VALUE whisper_model_file_path;
|
||||
VALUE context_params;
|
||||
struct whisper_context_params params;
|
||||
|
||||
// TODO: we can support init from buffer here too maybe another ruby object to expose
|
||||
rb_scan_args(argc, argv, "11", &whisper_model_file_path, &context_params);
|
||||
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
||||
rb_scan_args(argc, argv, "01", &whisper_model_file_path);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
|
||||
whisper_model_file_path = ruby_whisper_normalize_model_path(whisper_model_file_path);
|
||||
VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
|
||||
VALUE pre_converted_model = rb_hash_aref(pre_converted_models, whisper_model_file_path);
|
||||
if (!NIL_P(pre_converted_model)) {
|
||||
whisper_model_file_path = pre_converted_model;
|
||||
}
|
||||
if (TYPE(whisper_model_file_path) == T_STRING) {
|
||||
const char * whisper_model_file_path_str = StringValueCStr(whisper_model_file_path);
|
||||
if (strncmp("http://", whisper_model_file_path_str, 7) == 0 || strncmp("https://", whisper_model_file_path_str, 8) == 0) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
|
||||
}
|
||||
}
|
||||
if (rb_obj_is_kind_of(whisper_model_file_path, rb_path2class("URI::HTTP"))) {
|
||||
VALUE uri_class = rb_const_get(cModel, id_URI);
|
||||
whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
|
||||
}
|
||||
if (rb_respond_to(whisper_model_file_path, id_to_path)) {
|
||||
whisper_model_file_path = rb_funcall(whisper_model_file_path, id_to_path, 0);
|
||||
}
|
||||
if (!rb_respond_to(whisper_model_file_path, id_to_s)) {
|
||||
rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
|
||||
}
|
||||
if (NIL_P(context_params)) {
|
||||
params = whisper_context_default_params();
|
||||
} else {
|
||||
ruby_whisper_context_params *rwcp;
|
||||
GetContextParams(context_params, rwcp);
|
||||
params = rwcp->params;
|
||||
}
|
||||
rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), params);
|
||||
rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params());
|
||||
if (rw->context == NULL) {
|
||||
rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
|
||||
}
|
||||
|
|
@ -203,7 +104,7 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
|
|||
VALUE ruby_whisper_model_n_vocab(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_vocab(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -214,7 +115,7 @@ VALUE ruby_whisper_model_n_vocab(VALUE self)
|
|||
VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -225,7 +126,7 @@ VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
|
|||
VALUE ruby_whisper_model_n_audio_state(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_state(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -236,7 +137,7 @@ VALUE ruby_whisper_model_n_audio_state(VALUE self)
|
|||
VALUE ruby_whisper_model_n_audio_head(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_head(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -247,7 +148,7 @@ VALUE ruby_whisper_model_n_audio_head(VALUE self)
|
|||
VALUE ruby_whisper_model_n_audio_layer(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_layer(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -258,7 +159,7 @@ VALUE ruby_whisper_model_n_audio_layer(VALUE self)
|
|||
VALUE ruby_whisper_model_n_text_ctx(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_ctx(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -269,7 +170,7 @@ VALUE ruby_whisper_model_n_text_ctx(VALUE self)
|
|||
VALUE ruby_whisper_model_n_text_state(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_state(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -280,7 +181,7 @@ VALUE ruby_whisper_model_n_text_state(VALUE self)
|
|||
VALUE ruby_whisper_model_n_text_head(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_head(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -291,7 +192,7 @@ VALUE ruby_whisper_model_n_text_head(VALUE self)
|
|||
VALUE ruby_whisper_model_n_text_layer(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_layer(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -302,7 +203,7 @@ VALUE ruby_whisper_model_n_text_layer(VALUE self)
|
|||
VALUE ruby_whisper_model_n_mels(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_mels(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -313,7 +214,7 @@ VALUE ruby_whisper_model_n_mels(VALUE self)
|
|||
VALUE ruby_whisper_model_ftype(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_ftype(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -324,191 +225,10 @@ VALUE ruby_whisper_model_ftype(VALUE self)
|
|||
VALUE ruby_whisper_model_type(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return rb_str_new2(whisper_model_type_readable(rw->context));
|
||||
}
|
||||
|
||||
static bool
|
||||
check_memory_view(rb_memory_view_t *memview)
|
||||
{
|
||||
if (!memview->format) {
|
||||
rb_warn("currently format is required");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (strcmp(memview->format, "f") == 0) {
|
||||
// accept
|
||||
} else if (strcmp(memview->format, "e") == 0) {
|
||||
if (IS_BIGENDIAN) {
|
||||
rb_warn("currently format \"e\" is only supported on little-endian environment");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
rb_warn("currently only format \"f\" and \"e\" on little-endian environment is supported for MemoryView, but given: %s", memview->format);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (memview->ndim != 1 && !(memview->ndim == 2 && memview->shape[1] == 1)) {
|
||||
// TODO: Accept ndim == 2 with shape [n_samples, channels] and channels > 1 by averaging the samples in different channels or just taking the first channel
|
||||
rb_warn("currently only 1 dimensional MemoryView is supported, but given: %zd", memview->ndim);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
fill_samples(VALUE rb_args)
|
||||
{
|
||||
fill_samples_args *args = (fill_samples_args *)rb_args;
|
||||
|
||||
if (RB_TYPE_P(*args->src, T_ARRAY)) {
|
||||
for (int i = 0; i < args->n_samples; i++) {
|
||||
args->dest[i] = RFLOAT_VALUE(rb_ary_entry(*args->src, i));
|
||||
}
|
||||
} else {
|
||||
// TODO: use rb_block_call
|
||||
VALUE iter = rb_funcall(*args->src, id_to_enum, 1, rb_str_new2("each"));
|
||||
for (int i = 0; i < args->n_samples; i++) {
|
||||
// TODO: check if iter is exhausted and raise ArgumentError appropriately
|
||||
VALUE sample = rb_funcall(iter, id_next, 0);
|
||||
args->dest[i] = RFLOAT_VALUE(sample);
|
||||
}
|
||||
}
|
||||
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
struct parsed_samples_t
|
||||
parse_samples(VALUE *samples, VALUE *n_samples)
|
||||
{
|
||||
bool memview_available = rb_memory_view_available_p(*samples);
|
||||
struct parsed_samples_t parsed = {0};
|
||||
parsed.memview_exported = false;
|
||||
const bool is_array = RB_TYPE_P(*samples, T_ARRAY);
|
||||
|
||||
if (!NIL_P(*n_samples)) {
|
||||
parsed.n_samples = NUM2INT(*n_samples);
|
||||
if (is_array) {
|
||||
if (RARRAY_LEN(*samples) < parsed.n_samples) {
|
||||
rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(*samples), parsed.n_samples);
|
||||
}
|
||||
}
|
||||
// Should check when samples.respond_to?(:length)?
|
||||
} else {
|
||||
if (is_array) {
|
||||
if (RARRAY_LEN(*samples) > INT_MAX) {
|
||||
rb_raise(rb_eArgError, "samples are too long");
|
||||
}
|
||||
parsed.n_samples = (int)RARRAY_LEN(*samples);
|
||||
} else if (memview_available) {
|
||||
bool memview_got = rb_memory_view_get(*samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE);
|
||||
if (memview_got) {
|
||||
parsed.memview_exported = check_memory_view(&parsed.memview);
|
||||
if (!parsed.memview_exported) {
|
||||
rb_memory_view_release(&parsed.memview);
|
||||
parsed.memview = (rb_memory_view_t){0};
|
||||
}
|
||||
}
|
||||
if (parsed.memview_exported) {
|
||||
ssize_t n_samples_size = parsed.memview.byte_size / parsed.memview.item_size;
|
||||
if (n_samples_size > INT_MAX) {
|
||||
rb_memory_view_release(&parsed.memview);
|
||||
rb_raise(rb_eArgError, "samples are too long: %zd", n_samples_size);
|
||||
}
|
||||
parsed.n_samples = (int)n_samples_size;
|
||||
} else {
|
||||
rb_warn("unable to get a memory view. falls back to Ruby object");
|
||||
if (rb_respond_to(*samples, id_length)) {
|
||||
parsed.n_samples = NUM2INT(rb_funcall(*samples, id_length, 0));
|
||||
} else {
|
||||
rb_raise(rb_eArgError, "samples must respond to :length");
|
||||
}
|
||||
}
|
||||
} else if (rb_respond_to(*samples, id_length)) {
|
||||
parsed.n_samples = NUM2INT(rb_funcall(*samples, id_length, 0));
|
||||
} else {
|
||||
rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of float when n_samples is not given");
|
||||
}
|
||||
}
|
||||
|
||||
if (parsed.memview_exported) {
|
||||
parsed.samples = (float *)parsed.memview.data;
|
||||
} else {
|
||||
parsed.samples = ALLOC_N(float, parsed.n_samples);
|
||||
fill_samples_args args = {
|
||||
parsed.samples,
|
||||
samples,
|
||||
parsed.n_samples,
|
||||
};
|
||||
int state;
|
||||
rb_protect(fill_samples, (VALUE)&args, &state);
|
||||
if (state) {
|
||||
xfree(parsed.samples);
|
||||
rb_jump_tag(state);
|
||||
}
|
||||
}
|
||||
|
||||
return parsed;
|
||||
}
|
||||
|
||||
VALUE
|
||||
release_samples(VALUE rb_parsed_args)
|
||||
{
|
||||
parsed_samples_t *parsed_args = (parsed_samples_t *)rb_parsed_args;
|
||||
|
||||
if (parsed_args->memview_exported) {
|
||||
rb_memory_view_release(&parsed_args->memview);
|
||||
} else {
|
||||
xfree(parsed_args->samples);
|
||||
}
|
||||
*parsed_args = (parsed_samples_t){0};
|
||||
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
static void*
|
||||
full_without_gvl(void *rb_args)
|
||||
{
|
||||
full_without_gvl_args *args = (full_without_gvl_args *)rb_args;
|
||||
args->result = whisper_full(args->context, *args->params, args->samples, args->n_samples);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
full_ubf(void *rb_args)
|
||||
{
|
||||
full_ubf_args *args = (full_ubf_args *)rb_args;
|
||||
|
||||
args->abort_callback_container->is_interrupted = true;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
full_body(VALUE rb_args)
|
||||
{
|
||||
full_args *args = (full_args *)rb_args;
|
||||
|
||||
ruby_whisper *rw;
|
||||
ruby_whisper_params *rwp;
|
||||
GetContext(*args->context, rw);
|
||||
TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
|
||||
prepare_transcription(rwp, args->context, 1);
|
||||
|
||||
struct full_without_gvl_args full_without_gvl_args = {
|
||||
rw->context,
|
||||
&rwp->params,
|
||||
args->samples,
|
||||
args->n_samples,
|
||||
0,
|
||||
};
|
||||
full_ubf_args full_ubf_args = {
|
||||
rwp->abort_callback_container,
|
||||
};
|
||||
rb_thread_call_without_gvl(full_without_gvl, (void *)&full_without_gvl_args, full_ubf, (void *)&full_ubf_args);
|
||||
return INT2NUM(full_without_gvl_args.result);
|
||||
}
|
||||
|
||||
/*
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
* Not thread safe for same context
|
||||
|
|
@ -526,17 +246,58 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
|
|||
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
|
||||
}
|
||||
|
||||
VALUE n_samples = argc == 2 ? Qnil : argv[2];
|
||||
|
||||
struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples);
|
||||
full_args args = {
|
||||
&self,
|
||||
&argv[0],
|
||||
parsed.samples,
|
||||
parsed.n_samples,
|
||||
};
|
||||
VALUE rb_result = rb_ensure(full_body, (VALUE)&args, release_samples, (VALUE)&parsed);
|
||||
const int result = NUM2INT(rb_result);
|
||||
ruby_whisper *rw;
|
||||
ruby_whisper_params *rwp;
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
VALUE params = argv[0];
|
||||
Data_Get_Struct(params, ruby_whisper_params, rwp);
|
||||
VALUE samples = argv[1];
|
||||
int n_samples;
|
||||
rb_memory_view_t view;
|
||||
const bool memory_view_available_p = rb_memory_view_available_p(samples);
|
||||
if (argc == 3) {
|
||||
n_samples = NUM2INT(argv[2]);
|
||||
if (TYPE(samples) == T_ARRAY) {
|
||||
if (RARRAY_LEN(samples) < n_samples) {
|
||||
rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
|
||||
}
|
||||
}
|
||||
// Should check when samples.respond_to?(:length)?
|
||||
} else {
|
||||
if (TYPE(samples) == T_ARRAY) {
|
||||
n_samples = RARRAY_LEN(samples);
|
||||
} else if (memory_view_available_p) {
|
||||
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
|
||||
view.obj = Qnil;
|
||||
rb_raise(rb_eArgError, "unable to get a memory view");
|
||||
}
|
||||
n_samples = view.byte_size / view.item_size;
|
||||
} else if (rb_respond_to(samples, id_length)) {
|
||||
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
|
||||
} else {
|
||||
rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
|
||||
}
|
||||
}
|
||||
float * c_samples = (float *)malloc(n_samples * sizeof(float));
|
||||
if (memory_view_available_p) {
|
||||
c_samples = (float *)view.data;
|
||||
} else {
|
||||
if (TYPE(samples) == T_ARRAY) {
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
|
||||
}
|
||||
} else {
|
||||
// TODO: use rb_block_call
|
||||
VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
// TODO: check if iter is exhausted and raise ArgumentError appropriately
|
||||
VALUE sample = rb_funcall(iter, id_next, 0);
|
||||
c_samples[i] = RFLOAT_VALUE(sample);
|
||||
}
|
||||
}
|
||||
}
|
||||
register_callbacks(rwp, &self);
|
||||
const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
|
||||
if (0 == result) {
|
||||
return self;
|
||||
} else {
|
||||
|
|
@ -544,41 +305,6 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
|
|||
}
|
||||
}
|
||||
|
||||
static void*
|
||||
full_parallel_without_gvl(void *rb_args)
|
||||
{
|
||||
full_parallel_without_gvl_args *args = (full_parallel_without_gvl_args *)rb_args;
|
||||
args->result = whisper_full_parallel(args->context, *args->params, args->samples, args->n_samples, args->n_processors);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
full_parallel_body(VALUE rb_args)
|
||||
{
|
||||
full_parallel_args *args = (full_parallel_args *)rb_args;
|
||||
|
||||
ruby_whisper *rw;
|
||||
ruby_whisper_params *rwp;
|
||||
GetContext(*args->context, rw);
|
||||
TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
|
||||
prepare_transcription(rwp, args->context, args->n_processors);
|
||||
|
||||
struct full_parallel_without_gvl_args full_parallel_without_gvl_args = {
|
||||
rw->context,
|
||||
&rwp->params,
|
||||
args->samples,
|
||||
args->n_samples,
|
||||
args->n_processors,
|
||||
0,
|
||||
};
|
||||
full_ubf_args full_ubf_args = {
|
||||
rwp->abort_callback_container,
|
||||
};
|
||||
rb_thread_call_without_gvl(full_parallel_without_gvl, (void *)&full_parallel_without_gvl_args, full_ubf, (void *)&full_ubf_args);
|
||||
return INT2NUM(full_parallel_without_gvl_args.result);
|
||||
}
|
||||
|
||||
/*
|
||||
* Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
||||
* Result is stored in the default state of the context
|
||||
|
|
@ -596,11 +322,19 @@ static VALUE
|
|||
ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
|
||||
{
|
||||
if (argc < 2 || argc > 4) {
|
||||
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..4)", argc);
|
||||
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
|
||||
}
|
||||
|
||||
VALUE n_samples = argc == 2 ? Qnil : argv[2];
|
||||
ruby_whisper *rw;
|
||||
ruby_whisper_params *rwp;
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
VALUE params = argv[0];
|
||||
Data_Get_Struct(params, ruby_whisper_params, rwp);
|
||||
VALUE samples = argv[1];
|
||||
int n_samples;
|
||||
int n_processors;
|
||||
rb_memory_view_t view;
|
||||
const bool memory_view_available_p = rb_memory_view_available_p(samples);
|
||||
switch (argc) {
|
||||
case 2:
|
||||
n_processors = 1;
|
||||
|
|
@ -612,16 +346,49 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
|
|||
n_processors = NUM2INT(argv[3]);
|
||||
break;
|
||||
}
|
||||
struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples);
|
||||
const full_parallel_args args = {
|
||||
&self,
|
||||
&argv[0],
|
||||
parsed.samples,
|
||||
parsed.n_samples,
|
||||
n_processors,
|
||||
};
|
||||
const VALUE rb_result = rb_ensure(full_parallel_body, (VALUE)&args, release_samples, (VALUE)&parsed);
|
||||
const int result = NUM2INT(rb_result);
|
||||
if (argc >= 3 && !NIL_P(argv[2])) {
|
||||
n_samples = NUM2INT(argv[2]);
|
||||
if (TYPE(samples) == T_ARRAY) {
|
||||
if (RARRAY_LEN(samples) < n_samples) {
|
||||
rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
|
||||
}
|
||||
}
|
||||
// Should check when samples.respond_to?(:length)?
|
||||
} else if (memory_view_available_p) {
|
||||
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
|
||||
view.obj = Qnil;
|
||||
rb_raise(rb_eArgError, "unable to get a memory view");
|
||||
}
|
||||
n_samples = view.byte_size / view.item_size;
|
||||
} else {
|
||||
if (TYPE(samples) == T_ARRAY) {
|
||||
n_samples = RARRAY_LEN(samples);
|
||||
} else if (rb_respond_to(samples, id_length)) {
|
||||
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
|
||||
} else {
|
||||
rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
|
||||
}
|
||||
}
|
||||
float * c_samples = (float *)malloc(n_samples * sizeof(float));
|
||||
if (memory_view_available_p) {
|
||||
c_samples = (float *)view.data;
|
||||
} else {
|
||||
if (TYPE(samples) == T_ARRAY) {
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
|
||||
}
|
||||
} else {
|
||||
// FIXME: use rb_block_call
|
||||
VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
// TODO: check if iter is exhausted and raise ArgumentError
|
||||
VALUE sample = rb_funcall(iter, id_next, 0);
|
||||
c_samples[i] = RFLOAT_VALUE(sample);
|
||||
}
|
||||
}
|
||||
}
|
||||
register_callbacks(rwp, &self);
|
||||
const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
|
||||
if (0 == result) {
|
||||
return self;
|
||||
} else {
|
||||
|
|
@ -639,7 +406,7 @@ static VALUE
|
|||
ruby_whisper_full_n_segments(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_full_n_segments(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -653,7 +420,7 @@ static VALUE
|
|||
ruby_whisper_full_lang_id(VALUE self)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_full_lang_id(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -678,10 +445,10 @@ static VALUE
|
|||
ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
|
||||
return LONG2NUM(t0);
|
||||
return INT2NUM(t0);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -696,10 +463,10 @@ static VALUE
|
|||
ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
|
||||
return LONG2NUM(t1);
|
||||
return INT2NUM(t1);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -714,7 +481,7 @@ static VALUE
|
|||
ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
|
||||
return speaker_turn_next ? Qtrue : Qfalse;
|
||||
|
|
@ -732,7 +499,7 @@ static VALUE
|
|||
ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
|
||||
return rb_str_new2(text);
|
||||
|
|
@ -746,7 +513,7 @@ static VALUE
|
|||
ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
|
||||
{
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
||||
const float no_speech_prob = whisper_full_get_segment_no_speech_prob(rw->context, c_i_segment);
|
||||
return DBL2NUM(no_speech_prob);
|
||||
|
|
@ -757,7 +524,7 @@ ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
|
|||
static VALUE
|
||||
ruby_whisper_full_get_segment(VALUE self, VALUE i_segment)
|
||||
{
|
||||
return rb_whisper_segment_s_new(self, NUM2INT(i_segment));
|
||||
return rb_whisper_segment_initialize(self, NUM2INT(i_segment));
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -787,11 +554,11 @@ ruby_whisper_each_segment(VALUE self)
|
|||
}
|
||||
|
||||
ruby_whisper *rw;
|
||||
GetContext(self, rw);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
|
||||
const int n_segments = whisper_full_n_segments(rw->context);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
rb_yield(rb_whisper_segment_s_new(self, i));
|
||||
rb_yield(rb_whisper_segment_initialize(self, i));
|
||||
}
|
||||
|
||||
return self;
|
||||
|
|
@ -804,16 +571,14 @@ ruby_whisper_each_segment(VALUE self)
|
|||
static VALUE
|
||||
ruby_whisper_get_model(VALUE self)
|
||||
{
|
||||
return rb_whisper_model_s_new(self);
|
||||
return rb_whisper_model_initialize(self);
|
||||
}
|
||||
|
||||
VALUE
|
||||
void
|
||||
init_ruby_whisper_context(VALUE *mWhisper)
|
||||
{
|
||||
cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
|
||||
|
||||
transcribe_option_names[0] = id_n_processors;
|
||||
|
||||
rb_define_alloc_func(cContext, ruby_whisper_allocate);
|
||||
rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
|
||||
|
||||
|
|
@ -840,11 +605,9 @@ init_ruby_whisper_context(VALUE *mWhisper)
|
|||
rb_define_method(cContext, "full", ruby_whisper_full, -1);
|
||||
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
|
||||
|
||||
// High level
|
||||
// High leve
|
||||
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
|
||||
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
|
||||
|
||||
rb_define_method(cContext, "model", ruby_whisper_get_model, 0);
|
||||
|
||||
return cContext;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,163 +0,0 @@
|
|||
#include "ruby_whisper.h"
|
||||
|
||||
#define NUM_PARAMS 6
|
||||
|
||||
/*
 * Define a static getter/setter pair for the boolean field `params.<name>`
 * of ruby_whisper_context_params. The getter returns Qtrue/Qfalse; the
 * setter stores the truthiness of `value` (RTEST) and returns `value`.
 */
#define DEF_BOOLEAN_ATTR_METHOD(name) \
static VALUE \
ruby_whisper_context_params_get_ ## name(VALUE self) { \
  ruby_whisper_context_params *wrapper; \
  GetContextParams(self, wrapper); \
  if (wrapper->params.name) { \
    return Qtrue; \
  } \
  return Qfalse; \
} \
static VALUE \
ruby_whisper_context_params_set_ ## name(VALUE self, VALUE value) { \
  ruby_whisper_context_params *wrapper; \
  GetContextParams(self, wrapper); \
  wrapper->params.name = RTEST(value); \
  return value; \
}
|
||||
|
||||
/*
 * Define a static getter/setter pair for the integer field `params.<name>`
 * of ruby_whisper_context_params. The getter converts with INT2NUM; the
 * setter converts with NUM2INT and returns `value` unchanged.
 */
#define DEF_INT_ATTR_METHOD(name) \
static VALUE \
ruby_whisper_context_params_get_ ## name(VALUE self) { \
  ruby_whisper_context_params *wrapper; \
  GetContextParams(self, wrapper); \
  const int current = wrapper->params.name; \
  return INT2NUM(current); \
} \
static VALUE \
ruby_whisper_context_params_set_ ## name(VALUE self, VALUE value) { \
  ruby_whisper_context_params *wrapper; \
  GetContextParams(self, wrapper); \
  wrapper->params.name = NUM2INT(value); \
  return value; \
}
|
||||
|
||||
#define DEFINE_PARAM(param_name, nth) \
|
||||
id_ ## param_name = rb_intern(#param_name); \
|
||||
param_names[nth] = id_ ## param_name; \
|
||||
rb_define_method(cContextParams, #param_name, ruby_whisper_context_params_get_ ## param_name, 0); \
|
||||
rb_define_method(cContextParams, #param_name "=", ruby_whisper_context_params_set_ ## param_name, 1);
|
||||
|
||||
VALUE cContextParams;
|
||||
|
||||
static ID param_names[NUM_PARAMS];
|
||||
static ID id_use_gpu;
|
||||
static ID id_flash_attn;
|
||||
static ID id_gpu_device;
|
||||
static ID id_dtw_token_timestamps;
|
||||
static ID id_dtw_aheads_preset;
|
||||
static ID id_dtw_n_top;
|
||||
|
||||
static size_t
|
||||
ruby_whisper_context_params_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_context_params *rwcp = (ruby_whisper_context_params *)p;
|
||||
if (!rwcp) {
|
||||
return 0;
|
||||
}
|
||||
return sizeof(ruby_whisper_context_params);
|
||||
}
|
||||
|
||||
const rb_data_type_t ruby_whisper_context_params_type = {
|
||||
"ruby_whisper_context_params",
|
||||
{0, RUBY_DEFAULT_FREE, ruby_whisper_context_params_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_context_params_s_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_context_params *rwcp;
|
||||
return TypedData_Make_Struct(klass, ruby_whisper_context_params, &ruby_whisper_context_params_type, rwcp);
|
||||
}
|
||||
|
||||
DEF_BOOLEAN_ATTR_METHOD(use_gpu);
|
||||
DEF_BOOLEAN_ATTR_METHOD(flash_attn);
|
||||
DEF_INT_ATTR_METHOD(gpu_device);
|
||||
DEF_BOOLEAN_ATTR_METHOD(dtw_token_timestamps);
|
||||
DEF_INT_ATTR_METHOD(dtw_aheads_preset);
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_context_params_get_dtw_n_top(VALUE self) {
|
||||
ruby_whisper_context_params *rwcp;
|
||||
GetContextParams(self, rwcp);
|
||||
|
||||
int dtw_n_top = rwcp->params.dtw_n_top;
|
||||
|
||||
return dtw_n_top == -1 ? Qnil : INT2NUM(dtw_n_top);
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_context_params_set_dtw_n_top(VALUE self, VALUE value) {
|
||||
ruby_whisper_context_params *rwcp;
|
||||
GetContextParams(self, rwcp);
|
||||
|
||||
rwcp->params.dtw_n_top = NIL_P(value) ? -1 : NUM2INT(value);
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
#define SET_PARAM_IF_SAME(param_name) \
|
||||
if (id == id_ ## param_name) { \
|
||||
ruby_whisper_context_params_set_ ## param_name(self, value); \
|
||||
continue; \
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_context_params_initialize(int argc, VALUE *argv, VALUE self)
|
||||
{
|
||||
ruby_whisper_context_params *rwcp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_context_params, &ruby_whisper_context_params_type, rwcp);
|
||||
rwcp->params = whisper_context_default_params();
|
||||
|
||||
VALUE kw_hash;
|
||||
rb_scan_args_kw(RB_SCAN_ARGS_KEYWORDS, argc, argv, ":", &kw_hash);
|
||||
if (NIL_P(kw_hash)) {
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
VALUE values[NUM_PARAMS] = {Qundef};
|
||||
rb_get_kwargs(kw_hash, param_names, 0, NUM_PARAMS, values);
|
||||
|
||||
ID id;
|
||||
VALUE value;
|
||||
for (int i = 0; i < NUM_PARAMS; i++) {
|
||||
id = param_names[i];
|
||||
value = values[i];
|
||||
if (value == Qundef) {
|
||||
continue;
|
||||
}
|
||||
SET_PARAM_IF_SAME(use_gpu)
|
||||
SET_PARAM_IF_SAME(flash_attn)
|
||||
SET_PARAM_IF_SAME(gpu_device)
|
||||
SET_PARAM_IF_SAME(dtw_token_timestamps)
|
||||
SET_PARAM_IF_SAME(dtw_aheads_preset)
|
||||
SET_PARAM_IF_SAME(dtw_n_top)
|
||||
}
|
||||
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
#undef SET_PARAM_IF_SAME
|
||||
|
||||
void
|
||||
init_ruby_whisper_context_params(VALUE *cContext)
|
||||
{
|
||||
cContextParams = rb_define_class_under(*cContext, "Params", rb_cObject);
|
||||
|
||||
rb_define_alloc_func(cContextParams, ruby_whisper_context_params_s_allocate);
|
||||
rb_define_method(cContextParams, "initialize", ruby_whisper_context_params_initialize, -1);
|
||||
|
||||
DEFINE_PARAM(use_gpu, 0)
|
||||
DEFINE_PARAM(flash_attn, 1)
|
||||
DEFINE_PARAM(gpu_device, 2)
|
||||
DEFINE_PARAM(dtw_token_timestamps, 3)
|
||||
DEFINE_PARAM(dtw_aheads_preset, 4)
|
||||
DEFINE_PARAM(dtw_n_top, 5)
|
||||
}
|
||||
|
||||
#undef DEFINE_PARAM
|
||||
#undef DEF_INT_ATTR_METHOD
|
||||
#undef DEF_BOOLEAN_ATTR_METHOD
|
||||
#undef NUM_PARAMS
|
||||
|
|
@ -1,43 +1,22 @@
|
|||
#include <ruby.h>
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_type;
|
||||
|
||||
extern VALUE cModel;
|
||||
|
||||
static void rb_whisper_model_mark(void *p) {
|
||||
ruby_whisper_model *rwm = (ruby_whisper_model *)p;
|
||||
if (rwm->context) {
|
||||
rb_gc_mark(rwm->context);
|
||||
}
|
||||
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
|
||||
rb_gc_mark(rwm->context);
|
||||
}
|
||||
|
||||
static size_t
|
||||
ruby_whisper_model_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_model *rwm = (const ruby_whisper_model *)p;
|
||||
size_t size = sizeof(rwm);
|
||||
if (!rwm) {
|
||||
return 0;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static const rb_data_type_t rb_whisper_model_type = {
|
||||
"ruby_whisper_model",
|
||||
{rb_whisper_model_mark, RUBY_DEFAULT_FREE, ruby_whisper_model_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE ruby_whisper_model_allocate(VALUE klass) {
|
||||
ruby_whisper_model *rwm;
|
||||
return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
rwm = ALLOC(ruby_whisper_model);
|
||||
return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
|
||||
}
|
||||
|
||||
VALUE rb_whisper_model_s_new(VALUE context) {
|
||||
VALUE rb_whisper_model_initialize(VALUE context) {
|
||||
ruby_whisper_model *rwm;
|
||||
const VALUE model = ruby_whisper_model_allocate(cModel);
|
||||
TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(model, ruby_whisper_model, rwm);
|
||||
rwm->context = context;
|
||||
return model;
|
||||
};
|
||||
|
|
@ -50,9 +29,9 @@ static VALUE
|
|||
ruby_whisper_model_n_vocab(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_vocab(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -64,9 +43,9 @@ static VALUE
|
|||
ruby_whisper_model_n_audio_ctx(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -78,9 +57,9 @@ static VALUE
|
|||
ruby_whisper_model_n_audio_state(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_state(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -92,9 +71,9 @@ static VALUE
|
|||
ruby_whisper_model_n_audio_head(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_head(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -106,9 +85,9 @@ static VALUE
|
|||
ruby_whisper_model_n_audio_layer(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_audio_layer(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -120,9 +99,9 @@ static VALUE
|
|||
ruby_whisper_model_n_text_ctx(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_ctx(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -134,9 +113,9 @@ static VALUE
|
|||
ruby_whisper_model_n_text_state(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_state(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -148,9 +127,9 @@ static VALUE
|
|||
ruby_whisper_model_n_text_head(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_head(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -162,9 +141,9 @@ static VALUE
|
|||
ruby_whisper_model_n_text_layer(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_text_layer(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -176,9 +155,9 @@ static VALUE
|
|||
ruby_whisper_model_n_mels(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_n_mels(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -190,9 +169,9 @@ static VALUE
|
|||
ruby_whisper_model_ftype(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return INT2NUM(whisper_model_ftype(rw->context));
|
||||
}
|
||||
|
||||
|
|
@ -204,9 +183,9 @@ static VALUE
|
|||
ruby_whisper_model_type(VALUE self)
|
||||
{
|
||||
ruby_whisper_model *rwm;
|
||||
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
||||
Data_Get_Struct(self, ruby_whisper_model, rwm);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rwm->context, rw);
|
||||
Data_Get_Struct(rwm->context, ruby_whisper, rw);
|
||||
return rb_str_new2(whisper_model_type_readable(rw->context));
|
||||
}
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,63 +1,28 @@
|
|||
#include <ruby.h>
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
#define N_KEY_NAMES 6
|
||||
|
||||
extern ID id___method__;
|
||||
extern ID id_to_enum;
|
||||
static VALUE sym_start_time;
|
||||
static VALUE sym_end_time;
|
||||
static VALUE sym_text;
|
||||
static VALUE sym_no_speech_prob;
|
||||
static VALUE sym_speaker_turn_next;
|
||||
static VALUE sym_n_tokens;
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_type;
|
||||
|
||||
extern VALUE cSegment;
|
||||
|
||||
extern VALUE ruby_whisper_token_s_init(struct whisper_context *context, int i_segment, int index);
|
||||
|
||||
static void
|
||||
rb_whisper_segment_mark(void *p)
|
||||
rb_whisper_segment_mark(ruby_whisper_segment *rws)
|
||||
{
|
||||
ruby_whisper_segment *rws = (ruby_whisper_segment *)p;
|
||||
rb_gc_mark(rws->context);
|
||||
}
|
||||
|
||||
static size_t
|
||||
ruby_whisper_segment_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_segment *rws = (const ruby_whisper_segment *)p;
|
||||
size_t size = sizeof(rws);
|
||||
if (!rws) {
|
||||
return 0;
|
||||
}
|
||||
if (rws->index) {
|
||||
size += sizeof(rws->index);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static const rb_data_type_t ruby_whisper_segment_type = {
|
||||
"ruby_whisper_segment",
|
||||
{rb_whisper_segment_mark, RUBY_DEFAULT_FREE, ruby_whisper_segment_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
VALUE
|
||||
ruby_whisper_segment_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
return TypedData_Make_Struct(klass, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
rws = ALLOC(ruby_whisper_segment);
|
||||
return Data_Wrap_Struct(klass, rb_whisper_segment_mark, RUBY_DEFAULT_FREE, rws);
|
||||
}
|
||||
|
||||
VALUE
|
||||
rb_whisper_segment_s_new(VALUE context, int index)
|
||||
rb_whisper_segment_initialize(VALUE context, int index)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
const VALUE segment = ruby_whisper_segment_allocate(cSegment);
|
||||
TypedData_Get_Struct(segment, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(segment, ruby_whisper_segment, rws);
|
||||
rws->context = context;
|
||||
rws->index = index;
|
||||
return segment;
|
||||
|
|
@ -73,12 +38,12 @@ static VALUE
|
|||
ruby_whisper_segment_get_start_time(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rws->context, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
|
||||
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
|
||||
return LONG2NUM(t0 * 10);
|
||||
return INT2NUM(t0 * 10);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -91,12 +56,12 @@ static VALUE
|
|||
ruby_whisper_segment_get_end_time(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rws->context, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
|
||||
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
|
||||
return LONG2NUM(t1 * 10);
|
||||
return INT2NUM(t1 * 10);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -109,9 +74,9 @@ static VALUE
|
|||
ruby_whisper_segment_get_speaker_turn_next(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rws->context, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
return whisper_full_get_segment_speaker_turn_next(rw->context, rws->index) ? Qtrue : Qfalse;
|
||||
}
|
||||
|
||||
|
|
@ -123,9 +88,9 @@ static VALUE
|
|||
ruby_whisper_segment_get_text(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rws->context, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
const char * text = whisper_full_get_segment_text(rw->context, rws->index);
|
||||
return rb_str_new2(text);
|
||||
}
|
||||
|
|
@ -138,147 +103,21 @@ static VALUE
|
|||
ruby_whisper_segment_get_no_speech_prob(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
Data_Get_Struct(self, ruby_whisper_segment, rws);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rws->context, rw);
|
||||
Data_Get_Struct(rws->context, ruby_whisper, rw);
|
||||
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
|
||||
}
|
||||
|
||||
/*
|
||||
* Get number of tokens in the segment
|
||||
*
|
||||
* call-seq:
|
||||
* n_tokens -> Integer
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_segment_get_n_tokens(VALUE self)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rws->context, rw);
|
||||
return INT2NUM(whisper_full_n_tokens(rw->context, rws->index));
|
||||
}
|
||||
|
||||
/*
|
||||
* Yields each Whisper::Token:
|
||||
*
|
||||
* whisper.each_segment.first.each_token do |token|
|
||||
* p token
|
||||
* end
|
||||
*
|
||||
* Returns an Enumerator if no block is given:
|
||||
*
|
||||
* whisper.each_segment.first.each_token.to_a # => [#<Whisper::Token>, ...]
|
||||
*
|
||||
* call-seq:
|
||||
* each_token {|token| ... }
|
||||
* each_token -> Enumerator
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_segment_each_token(VALUE self)
|
||||
{
|
||||
if (!rb_block_given_p()) {
|
||||
const VALUE method_name = rb_funcall(self, id___method__, 0);
|
||||
return rb_funcall(self, id_to_enum, 1, method_name);
|
||||
}
|
||||
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rws->context, rw);
|
||||
|
||||
const int n_tokens = whisper_full_n_tokens(rw->context, rws->index);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
rb_yield(ruby_whisper_token_s_init(rw->context, rws->index, i));
|
||||
}
|
||||
|
||||
return self;
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* deconstruct_keys(keys) -> hash
|
||||
*
|
||||
* Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :n_tokens
|
||||
*
|
||||
* whisper.each_segment do |segment|
|
||||
* segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
||||
*
|
||||
* puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
|
||||
* end
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
ruby_whisper *rw;
|
||||
GetContext(rws->context, rw);
|
||||
|
||||
VALUE hash = rb_hash_new();
|
||||
long n_keys;
|
||||
if (NIL_P(keys)) {
|
||||
keys = rb_ary_new3(
|
||||
N_KEY_NAMES,
|
||||
sym_start_time,
|
||||
sym_end_time,
|
||||
sym_text,
|
||||
sym_no_speech_prob,
|
||||
sym_speaker_turn_next
|
||||
);
|
||||
n_keys = N_KEY_NAMES;
|
||||
} else {
|
||||
n_keys = RARRAY_LEN(keys);
|
||||
if (n_keys > N_KEY_NAMES) {
|
||||
return hash;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < n_keys; i++) {
|
||||
VALUE key = rb_ary_entry(keys, i);
|
||||
if (key == sym_start_time) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
|
||||
}
|
||||
if (key == sym_end_time) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
|
||||
}
|
||||
if (key == sym_text) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
|
||||
}
|
||||
if (key == sym_no_speech_prob) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
|
||||
}
|
||||
if (key == sym_speaker_turn_next) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
|
||||
}
|
||||
if (key == sym_n_tokens) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_n_tokens(self));
|
||||
}
|
||||
}
|
||||
|
||||
return hash;
|
||||
}
|
||||
|
||||
void
|
||||
init_ruby_whisper_segment(VALUE *mWhisper)
|
||||
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
|
||||
{
|
||||
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
|
||||
|
||||
sym_start_time = ID2SYM(rb_intern("start_time"));
|
||||
sym_end_time = ID2SYM(rb_intern("end_time"));
|
||||
sym_text = ID2SYM(rb_intern("text"));
|
||||
sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
|
||||
sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
|
||||
sym_n_tokens = ID2SYM(rb_intern("n_tokens"));
|
||||
|
||||
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
|
||||
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
|
||||
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
|
||||
rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
|
||||
rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
|
||||
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
|
||||
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
|
||||
rb_define_method(cSegment, "n_tokens", ruby_whisper_segment_get_n_tokens, 0);
|
||||
rb_define_method(cSegment, "each_token", ruby_whisper_segment_each_token, 0);
|
||||
rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
|
||||
}
|
||||
#undef N_KEY_NAMES
|
||||
|
|
|
|||
|
|
@ -1,371 +0,0 @@
|
|||
#include "ruby_whisper.h"
|
||||
|
||||
#define N_KEY_NAMES 11
|
||||
|
||||
extern VALUE cToken;
|
||||
extern const rb_data_type_t ruby_whisper_type;
|
||||
|
||||
static VALUE sym_id;
|
||||
static VALUE sym_tid;
|
||||
static VALUE sym_probability;
|
||||
static VALUE sym_log_probability;
|
||||
static VALUE sym_pt;
|
||||
static VALUE sym_ptsum;
|
||||
static VALUE sym_t_dtw;
|
||||
static VALUE sym_voice_length;
|
||||
static VALUE sym_start_time;
|
||||
static VALUE sym_end_time;
|
||||
static VALUE sym_text;
|
||||
|
||||
static size_t
|
||||
ruby_whisper_token_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_token *rwt = (const ruby_whisper_token *)p;
|
||||
if (!rwt) {
|
||||
return 0;
|
||||
}
|
||||
size_t size = sizeof(*rwt);
|
||||
if (rwt->token_data) {
|
||||
size += sizeof(*rwt->token_data);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static void
|
||||
ruby_whisper_token_mark(void *p)
|
||||
{
|
||||
ruby_whisper_token *rwt = (ruby_whisper_token *)p;
|
||||
rb_gc_mark(rwt->text);
|
||||
}
|
||||
|
||||
static void
|
||||
ruby_whisper_token_free(void *p)
|
||||
{
|
||||
ruby_whisper_token *rwt = (ruby_whisper_token *)p;
|
||||
if (rwt->token_data) {
|
||||
xfree(rwt->token_data);
|
||||
rwt->token_data = NULL;
|
||||
}
|
||||
xfree(rwt);
|
||||
}
|
||||
|
||||
static const rb_data_type_t ruby_whisper_token_type = {
|
||||
"ruby_whisper_token",
|
||||
{ruby_whisper_token_mark, ruby_whisper_token_free, ruby_whisper_token_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_token_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
VALUE token = TypedData_Make_Struct(klass, ruby_whisper_token, &ruby_whisper_token_type, rwt);
|
||||
rwt->token_data = NULL;
|
||||
rwt->text = Qnil;
|
||||
return token;
|
||||
}
|
||||
|
||||
VALUE
|
||||
ruby_whisper_token_s_init(struct whisper_context *context, int i_segment, int i_token)
|
||||
{
|
||||
const VALUE token = ruby_whisper_token_allocate(cToken);
|
||||
ruby_whisper_token *rwt;
|
||||
TypedData_Get_Struct(token, ruby_whisper_token, &ruby_whisper_token_type, rwt);
|
||||
rwt->token_data = ALLOC(whisper_token_data);
|
||||
*(rwt->token_data) = whisper_full_get_token_data(context, i_segment, i_token);
|
||||
rwt->text = rb_str_new2(whisper_full_get_token_text(context, i_segment, i_token));
|
||||
return token;
|
||||
}
|
||||
|
||||
/*
|
||||
* Token ID.
|
||||
*
|
||||
* call-seq:
|
||||
* id -> Integer
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_id(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return INT2NUM(rwt->token_data->id);
|
||||
}
|
||||
|
||||
/*
|
||||
* Forced timestamp token ID.
|
||||
*
|
||||
* call-seq:
|
||||
* tid -> Integer
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_tid(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return INT2NUM(rwt->token_data->tid);
|
||||
}
|
||||
|
||||
/*
|
||||
* Probability of the token.
|
||||
*
|
||||
* call-seq:
|
||||
* probability -> Float
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_p(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return DBL2NUM(rwt->token_data->p);
|
||||
}
|
||||
|
||||
/*
|
||||
* Log probability of the token.
|
||||
*
|
||||
* call-seq:
|
||||
* log_probability -> Float
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_plog(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return DBL2NUM(rwt->token_data->plog);
|
||||
}
|
||||
|
||||
/*
|
||||
* Probability of the timestamp token.
|
||||
*
|
||||
* call-seq:
|
||||
* pt -> Float
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_pt(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return DBL2NUM(rwt->token_data->pt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Sum of probability of all timestamp tokens.
|
||||
*
|
||||
* call-seq:
|
||||
* ptsum -> Float
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_ptsum(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return DBL2NUM(rwt->token_data->ptsum);
|
||||
}
|
||||
|
||||
/*
|
||||
* [EXPERIMENTAL] Token-level timestamps with DTW
|
||||
*
|
||||
* Do not use if you haven't computed token-level timestamps with dtw.
|
||||
* Roughly corresponds to the moment in audio in which the token was output.
|
||||
*
|
||||
* call-seq:
|
||||
* t_dtw -> Integer
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_t_dtw(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return LONG2NUM(rwt->token_data->t_dtw);
|
||||
}
|
||||
|
||||
/*
|
||||
* Voice length of the token.
|
||||
*
|
||||
* call-seq:
|
||||
* voice_length -> Float
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_vlen(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return DBL2NUM(rwt->token_data->vlen);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the token text of the token.
|
||||
*
|
||||
* call-seq:
|
||||
* text -> String
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_text(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return rwt->text;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start time of the token.
|
||||
*
|
||||
* Token-level timestamp data.
|
||||
* Do not use if you haven't computed token-level timestamps.
|
||||
*
|
||||
* call-seq:
|
||||
* start_time -> Integer
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_start_time(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return LONG2NUM(rwt->token_data->t0 * 10);
|
||||
}
|
||||
|
||||
/*
|
||||
* End time of the token.
|
||||
*
|
||||
* Token-level timestamp data.
|
||||
* Do not use if you haven't computed token-level timestamps.
|
||||
*
|
||||
* call-seq:
|
||||
* end_time -> Integer
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_token_get_end_time(VALUE self)
|
||||
{
|
||||
ruby_whisper_token *rwt;
|
||||
GetToken(self, rwt);
|
||||
return LONG2NUM(rwt->token_data->t1 * 10);
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* deconstruct_keys(keys) -> hash
|
||||
*
|
||||
* Possible keys: :id, :tid, :probability, :log_probability, :pt, :ptsum,
|
||||
* :t_dtw, :voice_length, :start_time, :end_time, :text
|
||||
* segment.each_token do |token|
|
||||
* token => {text:, probability:}
|
||||
* puts "#{text} (#{probability})"
|
||||
* end
|
||||
*/
|
||||
static VALUE ruby_whisper_token_deconstruct_keys(VALUE self, VALUE keys)
{
  /* rwt is not read below; GetToken is retained so this method keeps
   * whatever checks that macro performs on self. */
  ruby_whisper_token *rwt;
  GetToken(self, rwt);
  VALUE result = rb_hash_new();
  long key_count = 0;

  if (!NIL_P(keys)) {
    key_count = RARRAY_LEN(keys);
    if (key_count > N_KEY_NAMES) {
      /* More keys requested than exist: answer with the empty hash. */
      return result;
    }
  } else {
    /* nil means "every key": build the full list of N_KEY_NAMES symbols. */
    keys = rb_ary_new3(
      N_KEY_NAMES,
      sym_id,
      sym_tid,
      sym_probability,
      sym_log_probability,
      sym_pt,
      sym_ptsum,
      sym_t_dtw,
      sym_voice_length,
      sym_start_time,
      sym_end_time,
      sym_text
    );
    key_count = N_KEY_NAMES;
  }

  for (int idx = 0; idx < key_count; idx++) {
    const VALUE key = rb_ary_entry(keys, idx);
    VALUE entry;

    /* Pick the getter that belongs to the key; unknown keys are skipped. */
    if (key == sym_start_time) {
      entry = ruby_whisper_token_get_start_time(self);
    } else if (key == sym_end_time) {
      entry = ruby_whisper_token_get_end_time(self);
    } else if (key == sym_text) {
      entry = ruby_whisper_token_get_text(self);
    } else if (key == sym_probability) {
      entry = ruby_whisper_token_get_p(self);
    } else if (key == sym_id) {
      entry = ruby_whisper_token_get_id(self);
    } else if (key == sym_tid) {
      entry = ruby_whisper_token_get_tid(self);
    } else if (key == sym_log_probability) {
      entry = ruby_whisper_token_get_plog(self);
    } else if (key == sym_pt) {
      entry = ruby_whisper_token_get_pt(self);
    } else if (key == sym_ptsum) {
      entry = ruby_whisper_token_get_ptsum(self);
    } else if (key == sym_t_dtw) {
      entry = ruby_whisper_token_get_t_dtw(self);
    } else if (key == sym_voice_length) {
      entry = ruby_whisper_token_get_vlen(self);
    } else {
      continue;
    }
    rb_hash_aset(result, key, entry);
  }

  return result;
}
|
||||
|
||||
|
||||
void
|
||||
init_ruby_whisper_token(VALUE *mWhisper)
|
||||
{
|
||||
cToken = rb_define_class_under(*mWhisper, "Token", rb_cObject);
|
||||
|
||||
rb_define_alloc_func(cToken, ruby_whisper_token_allocate);
|
||||
|
||||
sym_id = ID2SYM(rb_intern("id"));
|
||||
sym_tid = ID2SYM(rb_intern("tid"));
|
||||
sym_probability = ID2SYM(rb_intern("probability"));
|
||||
sym_log_probability = ID2SYM(rb_intern("log_probability"));
|
||||
sym_pt = ID2SYM(rb_intern("pt"));
|
||||
sym_ptsum = ID2SYM(rb_intern("ptsum"));
|
||||
sym_t_dtw = ID2SYM(rb_intern("t_dtw"));
|
||||
sym_voice_length = ID2SYM(rb_intern("voice_length"));
|
||||
sym_start_time = ID2SYM(rb_intern("start_time"));
|
||||
sym_end_time = ID2SYM(rb_intern("end_time"));
|
||||
sym_text = ID2SYM(rb_intern("text"));
|
||||
|
||||
rb_define_method(cToken, "id", ruby_whisper_token_get_id, 0);
|
||||
rb_define_method(cToken, "tid", ruby_whisper_token_get_tid, 0);
|
||||
rb_define_method(cToken, "probability", ruby_whisper_token_get_p, 0);
|
||||
rb_define_method(cToken, "log_probability", ruby_whisper_token_get_plog, 0);
|
||||
rb_define_method(cToken, "pt", ruby_whisper_token_get_pt, 0);
|
||||
rb_define_method(cToken, "ptsum", ruby_whisper_token_get_ptsum, 0);
|
||||
rb_define_method(cToken, "t_dtw", ruby_whisper_token_get_t_dtw, 0);
|
||||
rb_define_method(cToken, "voice_length", ruby_whisper_token_get_vlen, 0);
|
||||
rb_define_method(cToken, "start_time", ruby_whisper_token_get_start_time, 0);
|
||||
rb_define_method(cToken, "end_time", ruby_whisper_token_get_end_time, 0);
|
||||
rb_define_method(cToken, "text", ruby_whisper_token_get_text, 0);
|
||||
|
||||
rb_define_method(cToken, "deconstruct_keys", ruby_whisper_token_deconstruct_keys, 1);
|
||||
}
|
||||
|
||||
#undef N_KEY_NAMES
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
#include <ruby.h>
|
||||
#include "ruby_whisper.h"
|
||||
#include "common-whisper.h"
|
||||
#include <string>
|
||||
|
|
@ -7,45 +8,11 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_type;
|
||||
extern const rb_data_type_t ruby_whisper_params_type;
|
||||
|
||||
extern ID id_to_s;
|
||||
extern ID id_call;
|
||||
extern ID id_to_path;
|
||||
extern ID transcribe_option_names[1];
|
||||
|
||||
extern void prepare_transcription(ruby_whisper_params * rwp, VALUE * self, int n_processors);
|
||||
|
||||
typedef struct{
|
||||
struct whisper_context *context;
|
||||
struct whisper_full_params *params;
|
||||
float *samples;
|
||||
size_t n_samples;
|
||||
int n_processors;
|
||||
int result;
|
||||
} transcribe_without_gvl_args;
|
||||
|
||||
static void*
|
||||
transcribe_without_gvl(void *rb_args)
|
||||
{
|
||||
transcribe_without_gvl_args *args = (transcribe_without_gvl_args *)rb_args;
|
||||
args->result = whisper_full_parallel(args->context, *args->params, args->samples, args->n_samples, args->n_processors);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
ruby_whisper_abort_callback_container *abort_callback_container;
|
||||
} transcribe_ubf_args;
|
||||
|
||||
static void
|
||||
transcribe_ubf(void *rb_args)
|
||||
{
|
||||
transcribe_ubf_args *args = (transcribe_ubf_args *)rb_args;
|
||||
|
||||
args->abort_callback_container->is_interrupted = true;
|
||||
}
|
||||
extern void
|
||||
register_callbacks(ruby_whisper_params * rwp, VALUE * self);
|
||||
|
||||
/*
|
||||
* transcribe a single file
|
||||
|
|
@ -64,24 +31,16 @@ VALUE
|
|||
ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
||||
ruby_whisper *rw;
|
||||
ruby_whisper_params *rwp;
|
||||
VALUE wave_file_path, blk, params, kws;
|
||||
VALUE opts[1];
|
||||
VALUE wave_file_path, blk, params;
|
||||
|
||||
rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, ¶ms, &kws, &blk);
|
||||
rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts);
|
||||
|
||||
int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
|
||||
|
||||
GetContext(self, rw);
|
||||
TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
||||
rb_scan_args(argc, argv, "02&", &wave_file_path, ¶ms, &blk);
|
||||
Data_Get_Struct(self, ruby_whisper, rw);
|
||||
Data_Get_Struct(params, ruby_whisper_params, rwp);
|
||||
|
||||
if (!rb_respond_to(wave_file_path, id_to_s)) {
|
||||
rb_raise(rb_eRuntimeError, "Expected file path to wave file");
|
||||
}
|
||||
|
||||
if (rb_respond_to(wave_file_path, id_to_path)) {
|
||||
wave_file_path = rb_funcall(wave_file_path, id_to_path, 0);
|
||||
}
|
||||
std::string fname_inp = StringValueCStr(wave_file_path);
|
||||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
|
|
@ -91,36 +50,20 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
|||
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
|
||||
return self;
|
||||
}
|
||||
// Commented out because it is work in progress
|
||||
// {
|
||||
// static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||
{
|
||||
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||
|
||||
// rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||
// bool is_aborted = *(bool*)user_data;
|
||||
// return !is_aborted;
|
||||
// };
|
||||
// rwp->params.encoder_begin_callback_user_data = &is_aborted;
|
||||
// }
|
||||
|
||||
prepare_transcription(rwp, &self, n_processors);
|
||||
|
||||
transcribe_without_gvl_args args = {
|
||||
rw->context,
|
||||
&rwp->params,
|
||||
pcmf32.data(),
|
||||
pcmf32.size(),
|
||||
n_processors,
|
||||
0,
|
||||
};
|
||||
transcribe_ubf_args ubf_args = {
|
||||
rwp->abort_callback_container,
|
||||
};
|
||||
rb_thread_call_without_gvl(transcribe_without_gvl, (void *)&args, transcribe_ubf, (void *)&ubf_args);
|
||||
if (args.result != 0) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
return self;
|
||||
rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||
bool is_aborted = *(bool*)user_data;
|
||||
return !is_aborted;
|
||||
};
|
||||
rwp->params.encoder_begin_callback_user_data = &is_aborted;
|
||||
}
|
||||
if (NIL_P(blk)) {
|
||||
|
||||
register_callbacks(rwp, &self);
|
||||
|
||||
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
return self;
|
||||
}
|
||||
const int n_segments = whisper_full_n_segments(rw->context);
|
||||
|
|
@ -129,7 +72,10 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
|||
const char * text = whisper_full_get_segment_text(rw->context, i);
|
||||
output = rb_str_concat(output, rb_str_new2(text));
|
||||
}
|
||||
rb_funcall(blk, id_call, 1, output);
|
||||
VALUE idCall = id_call;
|
||||
if (blk != Qnil) {
|
||||
rb_funcall(blk, idCall, 1, output);
|
||||
}
|
||||
return self;
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
|
|
|
|||
|
|
@ -1,122 +0,0 @@
|
|||
#include "ruby_whisper.h"
|
||||
|
||||
extern ID id_to_s;
|
||||
|
||||
extern VALUE cVADContext;
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_vad_params_type;
|
||||
extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params);
|
||||
extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
|
||||
extern parsed_samples_t parse_samples(VALUE *samples, VALUE *n_samples);
|
||||
extern VALUE release_samples(VALUE parsed);
|
||||
|
||||
extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments);
|
||||
|
||||
typedef struct segments_from_samples_args {
|
||||
VALUE *context;
|
||||
VALUE *params;
|
||||
float *samples;
|
||||
int n_samples;
|
||||
} segments_from_samples_args;
|
||||
|
||||
static size_t
|
||||
ruby_whisper_vad_context_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_vad_context *rwvc = p;
|
||||
size_t size = sizeof(rwvc);
|
||||
if (!rwvc) {
|
||||
return 0;
|
||||
}
|
||||
if (rwvc->context) {
|
||||
size += sizeof(rwvc->context);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static void
|
||||
ruby_whisper_vad_context_free(void *p)
|
||||
{
|
||||
ruby_whisper_vad_context *rwvc = (ruby_whisper_vad_context *)p;
|
||||
if (rwvc->context) {
|
||||
whisper_vad_free(rwvc->context);
|
||||
rwvc->context = NULL;
|
||||
}
|
||||
xfree(rwvc);
|
||||
}
|
||||
|
||||
const rb_data_type_t ruby_whisper_vad_context_type = {
|
||||
"ruby_whisper_vad_context",
|
||||
{0, ruby_whisper_vad_context_free, ruby_whisper_vad_context_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_context_s_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_vad_context *rwvc;
|
||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc);
|
||||
rwvc->context = NULL;
|
||||
return obj;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path)
|
||||
{
|
||||
ruby_whisper_vad_context *rwvc;
|
||||
struct whisper_vad_context *context;
|
||||
|
||||
model_path = ruby_whisper_normalize_model_path(model_path);
|
||||
context = whisper_vad_init_from_file_with_params(StringValueCStr(model_path), whisper_vad_default_context_params());
|
||||
if (context == NULL) {
|
||||
rb_raise(rb_eRuntimeError, "Failed to initialize whisper VAD context");
|
||||
}
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc);
|
||||
rwvc->context = context;
|
||||
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
segments_from_samples_body(VALUE rb_args)
|
||||
{
|
||||
segments_from_samples_args *args = (segments_from_samples_args *)rb_args;
|
||||
|
||||
ruby_whisper_vad_context *rwvc;
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
GetVADContext(*args->context, rwvc);
|
||||
GetVADParams(*args->params, rwvp);
|
||||
|
||||
struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(rwvc->context, rwvp->params, args->samples, args->n_samples);
|
||||
|
||||
return ruby_whisper_vad_segments_s_init(segments);
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self)
|
||||
{
|
||||
if (argc < 2 || argc > 3) {
|
||||
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
|
||||
}
|
||||
|
||||
VALUE n_samples = argc == 2 ? Qnil : argv[2];
|
||||
struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples);
|
||||
segments_from_samples_args args = {
|
||||
&self,
|
||||
&argv[0],
|
||||
parsed.samples,
|
||||
parsed.n_samples,
|
||||
};
|
||||
VALUE segments = rb_ensure(segments_from_samples_body, (VALUE)&args, release_samples, (VALUE)&parsed);
|
||||
|
||||
return segments;
|
||||
}
|
||||
|
||||
/*
 * Defines the Context class under the module that mVAD points to, together
 * with its allocator and the initialize, segments_from_samples and detect
 * methods.
 */
void init_ruby_whisper_vad_context(VALUE *mVAD)
{
  cVADContext = rb_define_class_under(*mVAD, "Context", rb_cObject);
  rb_define_alloc_func(cVADContext, ruby_whisper_vad_context_s_allocate);

  /* Arity -1: segments_from_samples validates its own argument count. */
  rb_define_method(cVADContext, "initialize", ruby_whisper_vad_context_initialize, 1);
  rb_define_method(cVADContext, "segments_from_samples", ruby_whisper_vad_segments_from_samples, -1);
  rb_define_method(cVADContext, "detect", ruby_whisper_vad_detect, 2);
}
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
#include "ruby_whisper.h"
|
||||
#include "common-whisper.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern ID id_to_path;
|
||||
|
||||
extern VALUE cVADSegments;
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_vad_context_type;
|
||||
extern const rb_data_type_t ruby_whisper_vad_params_type;
|
||||
extern const rb_data_type_t ruby_whisper_vad_segments_type;
|
||||
|
||||
extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments);
|
||||
|
||||
/*
 * detect(file_path, params)
 *
 * Reads the audio file at file_path (to_path is called on it first when the
 * object responds to it), runs VAD over the decoded samples and returns the
 * result wrapped in a Segments object.
 *
 * Raises RuntimeError when the file cannot be read or when detection
 * returns no result.
 *
 * The C++ sample buffers live in an inner scope that is closed before any
 * rb_raise(): raising while std::vector / std::string objects are alive
 * would jump over their destructors.
 */
VALUE
ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params) {
  ruby_whisper_vad_context *rwvc;
  ruby_whisper_vad_params *rwvp;
  whisper_vad_segments *segments = nullptr;
  bool audio_loaded = false;

  GetVADContext(self, rwvc);
  TypedData_Get_Struct(params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);

  if (rb_respond_to(file_path, id_to_path)) {
    file_path = rb_funcall(file_path, id_to_path, 0);
  }
  /* May raise; no C++ objects have been constructed yet at this point. */
  const char *c_file_path = StringValueCStr(file_path);

  {
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;

    audio_loaded = read_audio_data(c_file_path, pcmf32, pcmf32s, false);
    if (audio_loaded) {
      segments = whisper_vad_segments_from_samples(rwvc->context, rwvp->params, pcmf32.data(), pcmf32.size());
    }
  }

  if (!audio_loaded) {
    rb_raise(rb_eRuntimeError, "Failed to open '%s' as WAV file\n", c_file_path);
  }
  if (segments == nullptr) {
    rb_raise(rb_eRuntimeError, "Failed to process audio\n");
  }

  return ruby_whisper_vad_segments_s_init(segments);
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,287 +0,0 @@
|
|||
#include "ruby_whisper.h"
|
||||
|
||||
/*
 * Registers one VAD parameter: interns its name into id_<param_name>,
 * records that ID at param_names[nth], and defines the reader and the
 * "<param_name>=" writer on cVADParams.
 */
#define DEFINE_PARAM(param_name, nth) \
  param_names[nth] = id_ ## param_name = rb_intern(#param_name); \
  rb_define_method(cVADParams, #param_name, ruby_whisper_vad_params_get_ ## param_name, 0); \
  rb_define_method(cVADParams, #param_name "=", ruby_whisper_vad_params_set_ ## param_name, 1);
|
||||
|
||||
#define NUM_PARAMS 6
|
||||
|
||||
extern VALUE cVADParams;
|
||||
|
||||
static size_t
|
||||
ruby_whisper_vad_params_memsize(const void *p)
|
||||
{
|
||||
const struct ruby_whisper_vad_params *params = p;
|
||||
size_t size = sizeof(params);
|
||||
if (!params) {
|
||||
return 0;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static ID param_names[NUM_PARAMS];
|
||||
static ID id_threshold;
|
||||
static ID id_min_speech_duration_ms;
|
||||
static ID id_min_silence_duration_ms;
|
||||
static ID id_max_speech_duration_s;
|
||||
static ID id_speech_pad_ms;
|
||||
static ID id_samples_overlap;
|
||||
|
||||
const rb_data_type_t ruby_whisper_vad_params_type = {
|
||||
"ruby_whisper_vad_params",
|
||||
{0, 0, ruby_whisper_vad_params_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_s_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params = whisper_vad_default_params();
|
||||
return obj;
|
||||
}
|
||||
|
||||
/*
|
||||
* Probability threshold to consider as speech.
|
||||
*
|
||||
* call-seq:
|
||||
* threshold = th -> th
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_threshold(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.threshold = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_threshold(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return DBL2NUM(rwvp->params.threshold);
|
||||
}
|
||||
|
||||
/*
|
||||
* Min duration for a valid speech segment.
|
||||
*
|
||||
* call-seq:
|
||||
* min_speech_duration_ms = duration_ms -> duration_ms
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_min_speech_duration_ms(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.min_speech_duration_ms = NUM2INT(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_min_speech_duration_ms(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return INT2NUM(rwvp->params.min_speech_duration_ms);
|
||||
}
|
||||
|
||||
/*
|
||||
* Min silence duration to consider speech as ended.
|
||||
*
|
||||
* call-seq:
|
||||
* min_silence_duration_ms = duration_ms -> duration_ms
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_min_silence_duration_ms(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.min_silence_duration_ms = NUM2INT(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_min_silence_duration_ms(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return INT2NUM(rwvp->params.min_silence_duration_ms);
|
||||
}
|
||||
|
||||
/*
|
||||
* Max duration of a speech segment before forcing a new segment.
|
||||
*
|
||||
* call-seq:
|
||||
* max_speech_duration_s = duration_s -> duration_s
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_max_speech_duration_s(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.max_speech_duration_s = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_max_speech_duration_s(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return DBL2NUM(rwvp->params.max_speech_duration_s);
|
||||
}
|
||||
|
||||
/*
|
||||
* Padding added before and after speech segments.
|
||||
*
|
||||
* call-seq:
|
||||
* speech_pad_ms = pad_ms -> pad_ms
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_speech_pad_ms(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.speech_pad_ms = NUM2INT(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_speech_pad_ms(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return INT2NUM(rwvp->params.speech_pad_ms);
|
||||
}
|
||||
|
||||
/*
|
||||
* Overlap in seconds when copying audio samples from speech segment.
|
||||
*
|
||||
* call-seq:
|
||||
* samples_overlap = overlap -> overlap
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_set_samples_overlap(VALUE self, VALUE value)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
rwvp->params.samples_overlap = RFLOAT_VALUE(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_get_samples_overlap(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
return DBL2NUM(rwvp->params.samples_overlap);
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_params_equal(VALUE self, VALUE other)
|
||||
{
|
||||
ruby_whisper_vad_params *rwvp1;
|
||||
ruby_whisper_vad_params *rwvp2;
|
||||
|
||||
if (self == other) {
|
||||
return Qtrue;
|
||||
}
|
||||
|
||||
if (!rb_obj_is_kind_of(other, cVADParams)) {
|
||||
return Qfalse;
|
||||
}
|
||||
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp1);
|
||||
TypedData_Get_Struct(other, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp2);
|
||||
|
||||
if (rwvp1->params.threshold != rwvp2->params.threshold) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.min_speech_duration_ms != rwvp2->params.min_speech_duration_ms) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.min_silence_duration_ms != rwvp2->params.min_silence_duration_ms) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.max_speech_duration_s != rwvp2->params.max_speech_duration_s) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.speech_pad_ms != rwvp2->params.speech_pad_ms) {
|
||||
return Qfalse;
|
||||
}
|
||||
if (rwvp1->params.samples_overlap != rwvp2->params.samples_overlap) {
|
||||
return Qfalse;
|
||||
}
|
||||
|
||||
return Qtrue;
|
||||
}
|
||||
|
||||
/*
 * Loop-body helper for ruby_whisper_vad_params_initialize(): when the local
 * `id` matches id_<param_name>, calls that parameter's setter with the local
 * `value` and moves on to the next iteration. Relies on `continue`, so it
 * must be expanded directly inside the loop (no do/while wrapper).
 */
#define SET_PARAM_IF_SAME(param_name) \
  if (id == id_ ## param_name) { \
    ruby_whisper_vad_params_set_ ## param_name(self, value); \
    continue; \
  }
|
||||
|
||||
VALUE
|
||||
ruby_whisper_vad_params_initialize(int argc, VALUE *argv, VALUE self)
|
||||
{
|
||||
VALUE kw_hash;
|
||||
VALUE values[NUM_PARAMS] = {Qundef};
|
||||
VALUE value;
|
||||
ruby_whisper_vad_params *rwvp;
|
||||
ID id;
|
||||
int i;
|
||||
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
|
||||
|
||||
rb_scan_args_kw(RB_SCAN_ARGS_KEYWORDS, argc, argv, ":", &kw_hash);
|
||||
if (NIL_P(kw_hash)) {
|
||||
return self;
|
||||
}
|
||||
|
||||
rb_get_kwargs(kw_hash, param_names, 0, NUM_PARAMS, values);
|
||||
|
||||
for (i = 0; i < NUM_PARAMS; i++) {
|
||||
id = param_names[i];
|
||||
value = values[i];
|
||||
if (value == Qundef) {
|
||||
continue;
|
||||
}
|
||||
SET_PARAM_IF_SAME(threshold)
|
||||
SET_PARAM_IF_SAME(min_speech_duration_ms)
|
||||
SET_PARAM_IF_SAME(min_silence_duration_ms)
|
||||
SET_PARAM_IF_SAME(max_speech_duration_s)
|
||||
SET_PARAM_IF_SAME(speech_pad_ms)
|
||||
SET_PARAM_IF_SAME(samples_overlap)
|
||||
}
|
||||
|
||||
return self;
|
||||
}
|
||||
|
||||
#undef SET_PARAM_IF_SAME
|
||||
|
||||
void
|
||||
init_ruby_whisper_vad_params(VALUE *mVAD)
|
||||
{
|
||||
cVADParams = rb_define_class_under(*mVAD, "Params", rb_cObject);
|
||||
rb_define_alloc_func(cVADParams, ruby_whisper_vad_params_s_allocate);
|
||||
rb_define_method(cVADParams, "initialize", ruby_whisper_vad_params_initialize, -1);
|
||||
|
||||
DEFINE_PARAM(threshold, 0)
|
||||
DEFINE_PARAM(min_speech_duration_ms, 1)
|
||||
DEFINE_PARAM(min_silence_duration_ms, 2)
|
||||
DEFINE_PARAM(max_speech_duration_s, 3)
|
||||
DEFINE_PARAM(speech_pad_ms, 4)
|
||||
DEFINE_PARAM(samples_overlap, 5)
|
||||
|
||||
rb_define_method(cVADParams, "==", ruby_whisper_vad_params_equal, 1);
|
||||
}
|
||||
|
||||
#undef DEFINE_PARAM
|
||||
#undef NUM_PARAMS
|
||||
|
|
@ -1,138 +0,0 @@
|
|||
#include "ruby_whisper.h"
|
||||
|
||||
#define N_KEY_NAMES 2
|
||||
|
||||
extern VALUE cVADSegment;
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_vad_segments_type;
|
||||
|
||||
static VALUE sym_start_time;
|
||||
static VALUE sym_end_time;
|
||||
|
||||
static void
|
||||
rb_whisper_vad_segment_mark(void *p)
|
||||
{
|
||||
ruby_whisper_vad_segment *rwvs = (ruby_whisper_vad_segment *)p;
|
||||
rb_gc_mark(rwvs->segments);
|
||||
}
|
||||
|
||||
static size_t
|
||||
ruby_whisper_vad_segment_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_vad_segment *rwvs = p;
|
||||
size_t size = sizeof(rwvs);
|
||||
if (!rwvs) {
|
||||
return 0;
|
||||
}
|
||||
if (rwvs->index) {
|
||||
size += sizeof(rwvs->index);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static const rb_data_type_t ruby_whisper_vad_segment_type = {
|
||||
"ruby_whisper_vad_segment",
|
||||
{rb_whisper_vad_segment_mark, RUBY_DEFAULT_FREE, ruby_whisper_vad_segment_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_segment_s_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_vad_segment *rwvs;
|
||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs);
|
||||
rwvs->segments = Qnil;
|
||||
rwvs->index = -1;
|
||||
return obj;
|
||||
}
|
||||
|
||||
VALUE
|
||||
rb_whisper_vad_segment_s_new(VALUE segments, int index)
|
||||
{
|
||||
ruby_whisper_vad_segment *rwvs;
|
||||
const VALUE segment = ruby_whisper_vad_segment_s_allocate(cVADSegment);
|
||||
TypedData_Get_Struct(segment, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs);
|
||||
rwvs->segments = segments;
|
||||
rwvs->index = index;
|
||||
return segment;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_segment_get_start_time(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_segment *rwvs;
|
||||
ruby_whisper_vad_segments *rwvss;
|
||||
float t0;
|
||||
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs);
|
||||
TypedData_Get_Struct(rwvs->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
|
||||
t0 = whisper_vad_segments_get_segment_t0(rwvss->segments, rwvs->index);
|
||||
return DBL2NUM(t0 * 10);
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_segment_get_end_time(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_segment *rwvs;
|
||||
ruby_whisper_vad_segments *rwvss;
|
||||
float t1;
|
||||
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs);
|
||||
TypedData_Get_Struct(rwvs->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
|
||||
t1 = whisper_vad_segments_get_segment_t1(rwvss->segments, rwvs->index);
|
||||
return DBL2NUM(t1 * 10);
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_segment_deconstruct_keys(VALUE self, VALUE keys)
|
||||
{
|
||||
ruby_whisper_vad_segment *rwvs;
|
||||
ruby_whisper_vad_segments *rwvss;
|
||||
VALUE hash, key;
|
||||
long n_keys;
|
||||
int i;
|
||||
|
||||
TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs);
|
||||
TypedData_Get_Struct(rwvs->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
|
||||
|
||||
hash = rb_hash_new();
|
||||
if (NIL_P(keys)) {
|
||||
keys = rb_ary_new3(
|
||||
N_KEY_NAMES,
|
||||
sym_start_time,
|
||||
sym_end_time
|
||||
);
|
||||
n_keys = N_KEY_NAMES;
|
||||
} else {
|
||||
n_keys = RARRAY_LEN(keys);
|
||||
if (n_keys > N_KEY_NAMES) {
|
||||
return hash;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < n_keys; i++) {
|
||||
key = rb_ary_entry(keys, i);
|
||||
if (key == sym_start_time) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_start_time(self));
|
||||
}
|
||||
if (key == sym_end_time) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_end_time(self));
|
||||
}
|
||||
}
|
||||
|
||||
return hash;
|
||||
}
|
||||
|
||||
void
|
||||
init_ruby_whisper_vad_segment(VALUE *mVAD)
|
||||
{
|
||||
cVADSegment = rb_define_class_under(*mVAD, "Segment", rb_cObject);
|
||||
|
||||
sym_start_time = ID2SYM(rb_intern("start_time"));
|
||||
sym_end_time = ID2SYM(rb_intern("end_time"));
|
||||
|
||||
rb_define_alloc_func(cVADSegment, ruby_whisper_vad_segment_s_allocate);
|
||||
rb_define_method(cVADSegment, "start_time", ruby_whisper_vad_segment_get_start_time, 0);
|
||||
rb_define_method(cVADSegment, "end_time", ruby_whisper_vad_segment_get_end_time, 0);
|
||||
rb_define_method(cVADSegment, "deconstruct_keys", ruby_whisper_vad_segment_deconstruct_keys, 1);
|
||||
}
|
||||
|
|
@ -1,105 +0,0 @@
|
|||
#include "ruby_whisper.h"
|
||||
|
||||
extern ID id___method__;
|
||||
extern ID id_to_enum;
|
||||
|
||||
extern VALUE cVADSegments;
|
||||
|
||||
extern VALUE rb_whisper_vad_segment_s_new(VALUE segments, int index);
|
||||
|
||||
static size_t
|
||||
ruby_whisper_vad_segments_memsize(const void *p)
|
||||
{
|
||||
const ruby_whisper_vad_segments *rwvss = p;
|
||||
size_t size = sizeof(rwvss);
|
||||
if (!rwvss) {
|
||||
return 0;
|
||||
}
|
||||
if (rwvss->segments) {
|
||||
size += sizeof(rwvss->segments);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static void
|
||||
ruby_whisper_vad_segments_free(void *p)
|
||||
{
|
||||
ruby_whisper_vad_segments *rwvss = (ruby_whisper_vad_segments *)p;
|
||||
if (rwvss->segments) {
|
||||
whisper_vad_free_segments(rwvss->segments);
|
||||
rwvss->segments = NULL;
|
||||
}
|
||||
xfree(rwvss);
|
||||
}
|
||||
|
||||
const rb_data_type_t ruby_whisper_vad_segments_type = {
|
||||
"ruby_whisper_vad_segments",
|
||||
{0, ruby_whisper_vad_segments_free, ruby_whisper_vad_segments_memsize,},
|
||||
0, 0,
|
||||
0
|
||||
};
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_segments_s_allocate(VALUE klass)
|
||||
{
|
||||
ruby_whisper_vad_segments *rwvss;
|
||||
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
|
||||
rwvss->segments = NULL;
|
||||
return obj;
|
||||
}
|
||||
|
||||
VALUE
|
||||
ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments)
|
||||
{
|
||||
VALUE rb_segments;
|
||||
ruby_whisper_vad_segments *rwvss;
|
||||
|
||||
rb_segments = ruby_whisper_vad_segments_s_allocate(cVADSegments);
|
||||
TypedData_Get_Struct(rb_segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
|
||||
rwvss->segments = segments;
|
||||
|
||||
return rb_segments;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_segments_each(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_segments *rwvss;
|
||||
VALUE method_name;
|
||||
int n_segments, i;
|
||||
|
||||
if (!rb_block_given_p()) {
|
||||
method_name = rb_funcall(self, id___method__, 0);
|
||||
return rb_funcall(self, id_to_enum, 1, method_name);
|
||||
}
|
||||
|
||||
GetVADSegments(self, rwvss);
|
||||
n_segments = whisper_vad_segments_n_segments(rwvss->segments);
|
||||
for (i = 0; i < n_segments; ++i) {
|
||||
rb_yield(rb_whisper_vad_segment_s_new(self, i));
|
||||
}
|
||||
|
||||
return self;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
ruby_whisper_vad_segments_get_length(VALUE self)
|
||||
{
|
||||
ruby_whisper_vad_segments *rwvss;
|
||||
int n_segments;
|
||||
|
||||
GetVADSegments(self, rwvss);
|
||||
n_segments = whisper_vad_segments_n_segments(rwvss->segments);
|
||||
|
||||
return INT2NUM(n_segments);
|
||||
}
|
||||
|
||||
void
|
||||
init_ruby_whisper_vad_segments(VALUE *mVAD)
|
||||
{
|
||||
cVADSegments = rb_define_class_under(*mVAD, "Segments", rb_cObject);
|
||||
rb_define_alloc_func(cVADSegments, ruby_whisper_vad_segments_s_allocate);
|
||||
rb_define_method(cVADSegments, "each", ruby_whisper_vad_segments_each, 0);
|
||||
rb_define_method(cVADSegments, "length", ruby_whisper_vad_segments_get_length, 0);
|
||||
rb_include_module(cVADSegments, rb_path2class("Enumerable"));
|
||||
}
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
# GRAPHVIZ_* options: only static libraries are switched on; every other
# target category, and the per-target "depender" output, is switched off.

# Target categories
set(GRAPHVIZ_EXECUTABLES FALSE)
set(GRAPHVIZ_STATIC_LIBS TRUE)
set(GRAPHVIZ_SHARED_LIBS FALSE)
set(GRAPHVIZ_MODULE_LIBS FALSE)
set(GRAPHVIZ_INTERFACE_LIBS FALSE)
set(GRAPHVIZ_OBJECT_LIBS FALSE)
set(GRAPHVIZ_UNKNOWN_LIBS FALSE)

# Extra output files
set(GRAPHVIZ_GENERATE_DEPENDERS FALSE)
|
||||
|
|
@ -1,57 +1,6 @@
|
|||
require "pathname"
|
||||
require "yaml"
|
||||
|
||||
# Builds EXTSOURCES: the git-tracked files under the directory two levels up,
# minus everything matched by the ignore lists below.
root = Pathname("..")/".."

# Directories (relative to root) whose contents are skipped entirely.
ignored_dirs = %w[
  .devops
  .github
  ci
  examples/addon.node
  examples/bench.wasm
  examples/command
  examples/command.wasm
  examples/lsp
  examples/main
  examples/python
  examples/stream
  examples/stream.wasm
  examples/sycl
  examples/talk-llama
  examples/wchess
  examples/whisper.android
  examples/whisper.android.java
  examples/whisper.nvim
  examples/whisper.objc
  examples/whisper.swiftui
  examples/whisper.wasm
  grammars
  models
  samples
  scripts
  tests
].map {|dir| root/dir}

# File names skipped wherever they appear.
ignored_files = %w[
  AUTHORS
  Makefile
  .gitignore
  .gitmodules
  .dockerignore
]

# File extensions skipped wherever they appear.
ignored_exts = %w[
  .yml
  .sh
  .md
  .py
  .js
  .nvim
]

# Only paths whose second ancestor is root are kept, with one exception.
kept_outside_root = Pathname("..")/"javascript"/"package-tmpl.json"

ignored = ->(file) {
  ignored_exts.include?(file.extname) ||
    ignored_files.include?(file.basename.to_path) ||
    ignored_dirs.any? {|dir| file.descend.any? {|desc| desc == dir}} ||
    (file.descend.to_a[1] != root && file != kept_outside_root)
}

EXTSOURCES =
  `git ls-files -z #{root}`.split("\x0")
    .map {|file| Pathname(file)}
    .reject(&ignored)
    .map(&:to_path)
|
||||
# Builds EXTSOURCES from the path globs of the Ruby bindings CI workflow:
# every git-tracked file matching one of the globs, plus the LICENSE file.
sources = `git ls-files -z ../..`.split("\x0")

workflow = YAML.load_file("../../.github/workflows/bindings-ruby.yml")
# NOTE(review): the `true` key is presumably the workflow's `on:` entry as
# loaded by YAML - confirm against the workflow file.
paths = workflow[true]["push"]["paths"]
paths.delete "bindings/ruby/**"

candidates = Dir.glob(paths, base: "../..").map {|path| "../../#{path}"}
candidates << "../../LICENSE"

# Intersect with the tracked files so only files known to git remain.
EXTSOURCES = candidates & sources
|
||||
|
|
|
|||
|
|
@ -1,15 +0,0 @@
|
|||
module Whisper
  class Context
    # Concatenates, for every segment, a 1-based cue number line followed by
    # the segment's to_srt_cue text and a newline.
    def to_srt
      srt = ""
      each_segment.with_index(1) do |segment, number|
        srt << "#{number}\n#{segment.to_srt_cue}\n"
      end
      srt
    end

    # Same layout as to_srt, but the output starts with the "WEBVTT" header
    # line plus a blank line, and uses each segment's to_webvtt_cue text.
    def to_webvtt
      webvtt = "WEBVTT\n\n"
      each_segment.with_index(1) do |segment, number|
        webvtt << "#{number}\n#{segment.to_webvtt_cue}\n"
      end
      webvtt
    end
  end
end
|
||||
|
|
@ -34,7 +34,7 @@ module Whisper
|
|||
when /darwin/
|
||||
Pathname(Dir.home)/"Library/Caches"
|
||||
else
|
||||
ENV.key?("XDG_CACHE_HOME") ? Pathname(ENV["XDG_CACHE_HOME"]) : Pathname(Dir.home)/".cache"
|
||||
ENV.key?("XDG_CACHE_HOME") ? ENV["XDG_CACHE_HOME"] : Pathname(Dir.home)/".cache"
|
||||
end
|
||||
base/"whisper.cpp"
|
||||
end
|
||||
|
|
@ -53,10 +53,8 @@ module Whisper
|
|||
http.request request do |response|
|
||||
case response
|
||||
when Net::HTTPNotModified
|
||||
# noop
|
||||
# noop
|
||||
when Net::HTTPOK
|
||||
return if !response.key?("last-modified") && cache_path.exist?
|
||||
|
||||
download response
|
||||
when Net::HTTPRedirection
|
||||
request URI(response["location"]), headers
|
||||
|
|
@ -70,7 +68,7 @@ module Whisper
|
|||
rescue => err
|
||||
if cache_path.exist?
|
||||
warn err
|
||||
# Use cache file
|
||||
# Use cache file
|
||||
else
|
||||
raise
|
||||
end
|
||||
|
|
@ -94,8 +92,7 @@ module Whisper
|
|||
end
|
||||
|
||||
def show_progress(current, size)
|
||||
line_size = 47
|
||||
progress_rate_available = size && $stderr.tty? && $stderr.winsize[1] >= line_size
|
||||
progress_rate_available = size && $stderr.tty?
|
||||
|
||||
unless @prev
|
||||
@prev = Time.now
|
||||
|
|
@ -131,44 +128,6 @@ module Whisper
|
|||
end
|
||||
end
|
||||
|
||||
class ZipURI < URI
|
||||
def cache
|
||||
zip_path = super
|
||||
dest = unzipped_path
|
||||
return if dest.exist? && dest.mtime >= zip_path.mtime
|
||||
escaping dest do
|
||||
system "unzip", "-q", "-d", zip_path.dirname.to_path, zip_path.to_path, exception: true
|
||||
end
|
||||
zip_path
|
||||
end
|
||||
|
||||
def clear_cache
|
||||
super
|
||||
unzipped_path.rmtree if unzipped_path.exist?
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def unzipped_path
|
||||
cache_path.sub_ext("")
|
||||
end
|
||||
|
||||
def escaping(path)
|
||||
escaped = Pathname("#{path}.removing")
|
||||
if path.exist?
|
||||
escaped.rmtree if escaped.exist?
|
||||
path.rename escaped
|
||||
end
|
||||
yield
|
||||
ensure
|
||||
if path.exist?
|
||||
escaped.rmtree if escaped.exist?
|
||||
else
|
||||
escaped.rename path if escaped.exist?
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@pre_converted_models = %w[
|
||||
tiny
|
||||
tiny.en
|
||||
|
|
@ -182,6 +141,7 @@ module Whisper
|
|||
base-q8_0
|
||||
small
|
||||
small.en
|
||||
small.en-tdrz
|
||||
small-q5_1
|
||||
small.en-q5_1
|
||||
small-q8_0
|
||||
|
|
@ -203,30 +163,8 @@ module Whisper
|
|||
models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
|
||||
}
|
||||
|
||||
%w[
|
||||
small.en-tdrz
|
||||
].each do |name|
|
||||
@pre_converted_models[name] = URI.new("https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-#{name}.bin")
|
||||
end
|
||||
|
||||
%w[
|
||||
silero-v5.1.2
|
||||
silero-v6.2.0
|
||||
].each do |name|
|
||||
@pre_converted_models[name] = URI.new("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-#{name}.bin")
|
||||
end
|
||||
|
||||
@coreml_compiled_models = @pre_converted_models.each_with_object({}) {|(name, uri), models|
|
||||
next if name.end_with?("-tdrz") || name.start_with?("silero-")
|
||||
|
||||
if matched = name.match(/\A(?<name>.*)-q\d_\d\z/)
|
||||
name = matched[:name]
|
||||
end
|
||||
models[uri] = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip")
|
||||
}
|
||||
|
||||
class << self
|
||||
attr_reader :pre_converted_models, :coreml_compiled_models
|
||||
attr_reader :pre_converted_models
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
|||
|
|
@ -1,58 +0,0 @@
|
|||
module Whisper
|
||||
class Segment
|
||||
SRT_ESCAPES = {
|
||||
"&" => "&",
|
||||
"<" => "<",
|
||||
">" => ">",
|
||||
}
|
||||
SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys)
|
||||
private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE
|
||||
|
||||
def to_srt_cue
|
||||
"#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
|
||||
end
|
||||
|
||||
def to_webvtt_cue
|
||||
"#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def time_to_a(time)
|
||||
sec, decimal_part = time.divmod(1000)
|
||||
min, sec = sec.divmod(60)
|
||||
hour, min = min.divmod(60)
|
||||
[hour, min, sec, decimal_part]
|
||||
end
|
||||
|
||||
def srt_time(time)
|
||||
"%02d:%02d:%02d,%03d" % time_to_a(time)
|
||||
end
|
||||
|
||||
def srt_start_time
|
||||
srt_time(start_time)
|
||||
end
|
||||
|
||||
def srt_end_time
|
||||
srt_time(end_time)
|
||||
end
|
||||
|
||||
def srt_text
|
||||
text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
|
||||
end
|
||||
|
||||
def webvtt_time(time)
|
||||
"%02d:%02d:%02d.%03d" % time_to_a(time)
|
||||
end
|
||||
|
||||
def webvtt_start_time
|
||||
webvtt_time(start_time)
|
||||
end
|
||||
|
||||
def webvtt_end_time
|
||||
webvtt_time(end_time)
|
||||
end
|
||||
|
||||
alias webvtt_text srt_text
|
||||
end
|
||||
end
|
||||
|
|
@ -5,59 +5,27 @@ module Whisper
|
|||
end
|
||||
|
||||
type log_callback = ^(Integer level, String message, Object user_data) -> void
|
||||
type new_segment_callback = ^(Whisper::Context, untyped, Integer n_new, Object user_data) -> void
|
||||
type progress_callback = ^(Whisper::Context, untyped, Integer progress, Object user_data) -> void
|
||||
type encoder_begin_callback = ^(Whisper::Context, untyped, Object user_data) -> void
|
||||
type abort_callback = ^(Whisper::Context, untyped, Object user_data) -> boolish
|
||||
type new_segment_callback = ^(Whisper::Context, void, Integer n_new, Object user_data) -> void
|
||||
type progress_callback = ^(Whisper::Context, void, Integer progress, Object user_data) -> void
|
||||
type abort_callback = ^(Whisper::Context, void, Object user_data) -> boolish
|
||||
|
||||
VERSION: String
|
||||
LOG_LEVEL_NONE: Integer
|
||||
LOG_LEVEL_INFO: Integer
|
||||
LOG_LEVEL_WARN: Integer
|
||||
LOG_LEVEL_ERROR: Integer
|
||||
LOG_LEVEL_DEBUG: Integer
|
||||
LOG_LEVEL_CONT: Integer
|
||||
AHEADS_NONE: Integer
|
||||
AHEADS_N_TOP_MOST: Integer
|
||||
AHEADS_CUSTOM: Integer
|
||||
AHEADS_TINY_EN: Integer
|
||||
AHEADS_TINY: Integer
|
||||
AHEADS_BASE_EN: Integer
|
||||
AHEADS_BASE: Integer
|
||||
AHEADS_SMALL_EN: Integer
|
||||
AHEADS_SMALL: Integer
|
||||
AHEADS_MEDIUM_EN: Integer
|
||||
AHEADS_MEDIUM: Integer
|
||||
AHEADS_LARGE_V1: Integer
|
||||
AHEADS_LARGE_V2: Integer
|
||||
AHEADS_LARGE_V3: Integer
|
||||
AHEADS_LARGE_V3_TURBO: Integer
|
||||
|
||||
def self.lang_max_id: () -> Integer
|
||||
def self.lang_id: (string name) -> Integer
|
||||
def self.lang_str: (Integer id) -> String
|
||||
def self.lang_str_full: (Integer id) -> String
|
||||
def self.log_set: (log_callback?, Object? user_data) -> log_callback
|
||||
def self.system_info_str: () -> String
|
||||
def self.log_set: (log_callback, Object? user_data) -> log_callback
|
||||
|
||||
class Context
|
||||
def self.new: (String | path | ::URI::HTTP) -> instance
|
||||
|
||||
# transcribe a single file
|
||||
# results can be yielded to a block
|
||||
#
|
||||
# params = Whisper::Params.new
|
||||
# params.duration = 60_000
|
||||
# whisper.transcribe "path/to/audio.wav", params do |text|
|
||||
# puts text
|
||||
# end
|
||||
#
|
||||
# If `n_processors` is greater than 1, you cannot set any callbacks including
|
||||
# new_segment_callback, progress_callback, encoder_begin_callback, abort_callback,
|
||||
# and log_callback set by Whisper.log_set
|
||||
def transcribe: (path, Whisper::Params, ?n_processors: Integer) -> self
|
||||
| (path, Whisper::Params, ?n_processors: Integer) { (String) -> void } -> self
|
||||
|
||||
def self.new: (string | _ToPath | ::URI::HTTP) -> instance
|
||||
def transcribe: (string, Params) -> self
|
||||
| (string, Params) { (String) -> void } -> self
|
||||
def model_n_vocab: () -> Integer
|
||||
def model_n_audio_ctx: () -> Integer
|
||||
def model_n_audio_state: () -> Integer
|
||||
|
|
@ -66,105 +34,22 @@ module Whisper
|
|||
def model_n_mels: () -> Integer
|
||||
def model_ftype: () -> Integer
|
||||
def model_type: () -> String
|
||||
|
||||
# Yields each Whisper::Segment:
|
||||
#
|
||||
# whisper.transcribe("path/to/audio.wav", params)
|
||||
# whisper.each_segment do |segment|
|
||||
# puts segment.text
|
||||
# end
|
||||
#
|
||||
# Returns an `Enumerator` if no block given:
|
||||
#
|
||||
# whisper.transcribe("path/to/audio.wav", params)
|
||||
# enum = whisper.each_segment
|
||||
# enum.to_a # => [#<Whisper::Segment>, ...]
|
||||
#
|
||||
def each_segment: { (Segment) -> void } -> void
|
||||
| () -> Enumerator[Segment]
|
||||
|
||||
def model: () -> Model
|
||||
def full_get_segment: (Integer nth) -> Segment
|
||||
def full_n_segments: () -> Integer
|
||||
|
||||
# Language ID, which can be converted to string by Whisper.lang_str and Whisper.lang_str_full.
|
||||
#
|
||||
def full_lang_id: () -> Integer
|
||||
|
||||
# Start time of a segment indexed by `segment_index` in centiseconds (multiply by 10 to get milliseconds).
|
||||
#
|
||||
# full_get_segment_t0(3) # => 1668 (16680 ms)
|
||||
#
|
||||
def full_get_segment_t0: (Integer) -> Integer
|
||||
|
||||
# End time of a segment indexed by `segment_index` in centiseconds (multiply by 10 to get milliseconds).
|
||||
#
|
||||
# full_get_segment_t1(3) # => 1668 (16680 ms)
|
||||
#
|
||||
def full_get_segment_t1: (Integer) -> Integer
|
||||
|
||||
# Whether the next segment indexed by `segment_index` is predicted as a speaker turn.
|
||||
#
|
||||
# full_get_segment_speaker_turn_next(3) # => true
|
||||
#
|
||||
def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
|
||||
|
||||
# Text of a segment indexed by `segment_index`.
|
||||
#
|
||||
# full_get_segment_text(3) # => "ask not what your country can do for you, ..."
|
||||
#
|
||||
def full_get_segment_text: (Integer) -> String
|
||||
|
||||
def full_get_segment_no_speech_prob: (Integer) -> Float
|
||||
|
||||
# Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
# Not thread safe for same context
|
||||
# Uses the specified decoding strategy to obtain the text.
|
||||
#
|
||||
# The second argument `samples` must be an array of samples, respond to `:length`, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
|
||||
#
|
||||
def full: (Whisper::Params, Array[Float] samples, ?Integer n_samples) -> self
|
||||
| (Whisper::Params, _Samples, ?Integer n_samples) -> self
|
||||
|
||||
# Split the input audio in chunks and process each chunk separately using `whisper_full_with_state()`
|
||||
# Result is stored in the default state of the context
|
||||
# Not thread safe if executed in parallel on the same context.
|
||||
# It seems this approach can offer some speedup in some cases.
|
||||
# However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||
#
|
||||
# If `n_processors` is greater than 1, you cannot set any callbacks including
|
||||
# new_segment_callback, progress_callback, encoder_begin_callback, abort_callback,
|
||||
# and log_callback set by Whisper.log_set
|
||||
def full_parallel: (Whisper::Params, Array[Float], ?Integer n_samples) -> self
|
||||
| (Whisper::Params, _Samples, ?Integer n_samples) -> self
|
||||
| (Whisper::Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
|
||||
|
||||
def to_srt: () -> String
|
||||
def to_webvtt: () -> String
|
||||
|
||||
class Params
|
||||
def self.new: (
|
||||
use_gpu: boolish,
|
||||
flash_attn: boolish,
|
||||
gpu_device: Integer,
|
||||
dtw_token_timestamps: boolish,
|
||||
dtw_aheads_preset: Integer,
|
||||
dtw_n_top: Integer | nil,
|
||||
) -> instance
|
||||
|
||||
def use_gpu=: (boolish) -> boolish
|
||||
def use_gpu: () -> (true | false)
|
||||
def flash_attn=: (boolish) -> boolish
|
||||
def flash_attn: () -> (true | false)
|
||||
def gpu_device=: (Integer) -> Integer
|
||||
def gpu_device: () -> Integer
|
||||
def dtw_token_timestamps=: (boolish) -> boolish
|
||||
def dtw_token_timestamps: () -> (true | false)
|
||||
def dtw_aheads_preset=: (Integer) -> Integer
|
||||
def dtw_aheads_preset: () -> Integer
|
||||
def dtw_n_top=: (Integer | nil) -> (Integer | nil)
|
||||
def dtw_n_top: () -> (Integer | nil)
|
||||
end
|
||||
def full: (Params, Array[Float] samples, ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer n_samples) -> self
|
||||
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
|
||||
end
|
||||
|
||||
class Params
|
||||
|
|
@ -180,10 +65,8 @@ module Whisper
|
|||
?suppress_blank: boolish,
|
||||
?suppress_nst: boolish,
|
||||
?token_timestamps: boolish,
|
||||
?max_len: Integer,
|
||||
?split_on_word: boolish,
|
||||
?initial_prompt: string | nil,
|
||||
?carry_initial_prompt: boolish,
|
||||
?diarize: boolish,
|
||||
?offset: Integer,
|
||||
?duration: Integer,
|
||||
|
|
@ -199,254 +82,76 @@ module Whisper
|
|||
?new_segment_callback_user_data: Object,
|
||||
?progress_callback: progress_callback,
|
||||
?progress_callback_user_data: Object,
|
||||
?encoder_begin_callback: encoder_begin_callback,
|
||||
?encoder_begin_callback_user_data: Object,
|
||||
?abort_callback: abort_callback,
|
||||
?abort_callback_user_data: Object,
|
||||
?vad: boolish,
|
||||
?vad_model_path: path | URI,
|
||||
?vad_params: Whisper::VAD::Params
|
||||
?abort_callback_user_data: Object
|
||||
) -> instance
|
||||
|
||||
# params.language = "auto" | "en", etc...
|
||||
#
|
||||
def language=: (String) -> String # TODO: Enumerate lang names
|
||||
|
||||
def language: () -> String
|
||||
def translate=: (boolish) -> boolish
|
||||
def translate: () -> (true | false)
|
||||
def no_context=: (boolish) -> boolish
|
||||
|
||||
# If `true`, does not use past transcription (if any) as initial prompt for the decoder.
|
||||
#
|
||||
def no_context: () -> (true | false)
|
||||
|
||||
def single_segment=: (boolish) -> boolish
|
||||
|
||||
# If `true`, forces single segment output (useful for streaming).
|
||||
#
|
||||
def single_segment: () -> (true | false)
|
||||
|
||||
def print_special=: (boolish) -> boolish
|
||||
|
||||
# If `true`, prints special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.).
|
||||
#
|
||||
def print_special: () -> (true | false)
|
||||
|
||||
def print_progress=: (boolish) -> boolish
|
||||
|
||||
# If `true`, prints progress information.
|
||||
#
|
||||
def print_progress: () -> (true | false)
|
||||
|
||||
def print_realtime=: (boolish) -> boolish
|
||||
|
||||
# If `true`, prints results from within whisper.cpp. (avoid it, use callback instead)
|
||||
#
|
||||
def print_realtime: () -> (true | false)
|
||||
|
||||
# If `true`, prints timestamps for each text segment when printing realtime.
|
||||
#
|
||||
def print_timestamps=: (boolish) -> boolish
|
||||
|
||||
def print_timestamps: () -> (true | false)
|
||||
|
||||
def suppress_blank=: (boolish) -> boolish
|
||||
|
||||
# If `true`, suppresses blank outputs.
|
||||
#
|
||||
def suppress_blank: () -> (true | false)
|
||||
|
||||
def suppress_nst=: (boolish) -> boolish
|
||||
|
||||
# If `true`, suppresses non-speech-tokens.
|
||||
#
|
||||
def suppress_nst: () -> (true | false)
|
||||
|
||||
def token_timestamps=: (boolish) -> boolish
|
||||
|
||||
# If `true`, enables token-level timestamps.
|
||||
#
|
||||
def token_timestamps: () -> (true | false)
|
||||
|
||||
def max_len=: (Integer) -> Integer
|
||||
|
||||
# max segment length in characters.
|
||||
#
|
||||
def max_len: () -> Integer
|
||||
|
||||
def split_on_word=: (boolish) -> boolish
|
||||
|
||||
# If `true`, split on word rather than on token (when used with max_len).
|
||||
#
|
||||
def split_on_word: () -> (true | false)
|
||||
|
||||
def initial_prompt=: (_ToS) -> _ToS
|
||||
def carry_initial_prompt=: (boolish) -> boolish
|
||||
|
||||
# Tokens to provide to the whisper decoder as initial prompt
|
||||
# these are prepended to any existing text context from a previous call
|
||||
# use whisper_tokenize() to convert text to tokens.
|
||||
# Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224).
|
||||
#
|
||||
def initial_prompt: () -> (String | nil)
|
||||
def carry_initial_prompt: () -> (true | false)
|
||||
|
||||
def diarize=: (boolish) -> boolish
|
||||
|
||||
# If `true`, enables diarization.
|
||||
#
|
||||
def diarize: () -> (true | false)
|
||||
|
||||
def offset=: (Integer) -> Integer
|
||||
|
||||
# Start offset in ms.
|
||||
#
|
||||
def offset: () -> Integer
|
||||
|
||||
def duration=: (Integer) -> Integer
|
||||
|
||||
# Audio duration to process in ms.
|
||||
#
|
||||
def duration: () -> Integer
|
||||
|
||||
def max_text_tokens=: (Integer) -> Integer
|
||||
|
||||
# Max tokens to use from past text as prompt for the decoder.
|
||||
#
|
||||
def max_text_tokens: () -> Integer
|
||||
|
||||
def temperature=: (Float) -> Float
|
||||
def temperature: () -> Float
|
||||
def max_initial_ts=: (Float) -> Float
|
||||
|
||||
# See https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
|
||||
#
|
||||
def max_initial_ts: () -> Float
|
||||
|
||||
def length_penalty=: (Float) -> Float
|
||||
def length_penalty: () -> Float
|
||||
def temperature_inc=: (Float) -> Float
|
||||
def temperature_inc: () -> Float
|
||||
def entropy_thold=: (Float) -> Float
|
||||
|
||||
# Similar to OpenAI's "compression_ratio_threshold"
|
||||
#
|
||||
def entropy_thold: () -> Float
|
||||
|
||||
def logprob_thold=: (Float) -> Float
|
||||
def logprob_thold: () -> Float
|
||||
def no_speech_thold=: (Float) -> Float
|
||||
def no_speech_thold: () -> Float
|
||||
|
||||
# Sets new segment callback, called for every newly generated text segment.
|
||||
#
|
||||
# params.new_segment_callback = ->(context, _, n_new, user_data) {
|
||||
# # ...
|
||||
# }
|
||||
#
|
||||
def new_segment_callback=: (new_segment_callback) -> new_segment_callback
|
||||
def new_segment_callback: () -> (new_segment_callback | nil)
|
||||
|
||||
# Sets user data passed to the last argument of new segment callback.
|
||||
#
|
||||
def new_segment_callback_user_data=: (Object) -> Object
|
||||
|
||||
def new_segment_callback_user_data: () -> Object
|
||||
|
||||
# Sets progress callback, called on each progress update.
|
||||
#
|
||||
# params.progress_callback = ->(context, _, progress, user_data) {
|
||||
# # ...
|
||||
# }
|
||||
#
|
||||
# +progress+ is an Integer between 0 and 100.
|
||||
#
|
||||
def progress_callback=: (progress_callback) -> progress_callback
|
||||
|
||||
def progress_callback: () -> (progress_callback | nil)
|
||||
|
||||
# Sets user data passed to the last argument of progress callback.
|
||||
#
|
||||
def progress_callback_user_data=: (Object) -> Object
|
||||
|
||||
def progress_callback_user_data: () -> Object
|
||||
|
||||
# Sets encoder begin callback, called when the encoder starts.
|
||||
#
|
||||
def encoder_begin_callback=: (encoder_begin_callback) -> encoder_begin_callback
|
||||
|
||||
def encoder_begin_callback: () -> (encoder_begin_callback | nil)
|
||||
|
||||
# Sets user data passed to the last argument of encoder begin callback.
|
||||
#
|
||||
def encoder_begin_callback_user_data=: (Object) -> Object
|
||||
|
||||
def encoder_begin_callback_user_data: () -> Object
|
||||
|
||||
# Sets abort callback, called to check if the process should be aborted.
|
||||
#
|
||||
# params.abort_callback = ->(user_data) {
|
||||
# # ...
|
||||
# }
|
||||
#
|
||||
#
|
||||
def abort_callback=: (abort_callback) -> abort_callback
|
||||
|
||||
def abort_callback: () -> (abort_callback | nil)
|
||||
|
||||
# Sets user data passed to the last argument of abort callback.
|
||||
#
|
||||
def abort_callback_user_data=: (Object) -> Object
|
||||
|
||||
def abort_callback_user_data: () -> Object
|
||||
|
||||
# Enable VAD
|
||||
#
|
||||
def vad=: (boolish) -> boolish
|
||||
|
||||
def vad: () -> (true | false)
|
||||
|
||||
# Path to the VAD model
|
||||
def vad_model_path=: (path | URI | nil) -> (path | URI | nil)
|
||||
|
||||
def vad_model_path: () -> (String | nil)
|
||||
|
||||
def vad_params=: (Whisper::VAD::Params) -> Whisper::VAD::Params
|
||||
def vad_params: () -> (Whisper::VAD::Params)
|
||||
|
||||
# Hook called on new segment. Yields each Whisper::Segment.
|
||||
#
|
||||
# whisper.on_new_segment do |segment|
|
||||
# # ...
|
||||
# end
|
||||
#
|
||||
def on_new_segment: { (Segment) -> void } -> void
|
||||
|
||||
# Hook called on progress update. Yields each progress `Integer` between 0 and 100.
|
||||
#
|
||||
def on_progress: { (Integer progress) -> void } -> void
|
||||
|
||||
# Hook called when the encoder starts.
|
||||
#
|
||||
def on_encoder_begin: { () -> void } -> void
|
||||
|
||||
# Calls the block to determine whether to abort. Return `true` when you want to abort.
|
||||
#
|
||||
# params.abort_on do
|
||||
# if some_condition
|
||||
# true # abort
|
||||
# else
|
||||
# false # continue
|
||||
# end
|
||||
# end
|
||||
#
|
||||
def abort_on: { (Object user_data) -> boolish } -> void
|
||||
end
|
||||
|
||||
class Model
|
||||
def self.pre_converted_models: () -> Hash[String, Model::URI]
|
||||
def self.coreml_compiled_models: () -> Hash[Model::URI, Model::ZipURI]
|
||||
def self.new: () -> instance
|
||||
def n_vocab: () -> Integer
|
||||
def n_audio_ctx: () -> Integer
|
||||
|
|
@ -462,215 +167,18 @@ module Whisper
|
|||
def type: () -> String
|
||||
|
||||
class URI
|
||||
def self.new: (string | ::URI::HTTP) -> instance
|
||||
def self.new: (string | ::URI::HTTP) -> self
|
||||
def to_path: -> String
|
||||
def clear_cache: -> void
|
||||
end
|
||||
|
||||
class ZipURI < URI
|
||||
def cache: () -> Pathname
|
||||
def clear_cache: () -> void
|
||||
end
|
||||
end
|
||||
|
||||
class Segment
|
||||
type deconstructed_keys = {
|
||||
start_time: (Integer | nil),
|
||||
end_time: (Integer | nil),
|
||||
text: (String | nil),
|
||||
no_speech_prob: (Float | nil),
|
||||
speaker_turn_next: (true | false | nil),
|
||||
n_tokens: (Integer | nil)
|
||||
}
|
||||
|
||||
# Start time in milliseconds.
|
||||
#
|
||||
def start_time: () -> Integer
|
||||
|
||||
# End time in milliseconds.
|
||||
#
|
||||
def end_time: () -> Integer
|
||||
|
||||
# Whether the next segment is predicted as a speaker turn.
|
||||
#
|
||||
def speaker_turn_next?: () -> (true | false)
|
||||
|
||||
def speaker_next_turn?: () -> (true | false)
|
||||
def text: () -> String
|
||||
def no_speech_prob: () -> Float
|
||||
|
||||
# Get number of tokens in the segment
|
||||
#
|
||||
def n_tokens: () -> Integer
|
||||
|
||||
# Yields each Whisper::Token:
|
||||
#
|
||||
# whisper.each_segment.first.each_token do |token|
|
||||
# p token
|
||||
# end
|
||||
#
|
||||
# Returns an `Enumerator` if no block is given:
|
||||
#
|
||||
# whisper.each_segment.first.each_token.to_a # => [#<Whisper::Token>, ...]
|
||||
#
|
||||
def each_token: { (Token) -> void } -> void
|
||||
| () -> Enumerator[Token]
|
||||
def to_srt_cue: () -> String
|
||||
def to_webvtt_cue: () -> String
|
||||
|
||||
|
||||
# Possible keys: `:start_time`, `:end_time`, `:text`, `:no_speech_prob`, `:speaker_turn_next`, `:n_tokens`
|
||||
#
|
||||
# whisper.each_segment do |segment|
|
||||
# segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
||||
#
|
||||
# puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
|
||||
# end
|
||||
def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next | :n_tokens] | nil) -> deconstructed_keys
|
||||
end
|
||||
|
||||
module Token
|
||||
type deconstructed_keys = {
|
||||
id: (Integer | nil),
|
||||
tid: (Integer | nil),
|
||||
probability: (Float | nil),
|
||||
log_probability: (Float | nil),
|
||||
pt: (Float | nil),
|
||||
ptsum: (Float | nil),
|
||||
t_dtw: (Integer | nil),
|
||||
voice_length: (Float | nil),
|
||||
text: (String | nil),
|
||||
start_time: (Integer | nil),
|
||||
end_time: (Integer | nil),
|
||||
}
|
||||
|
||||
# Token ID.
|
||||
#
|
||||
def id: () -> Integer
|
||||
|
||||
# Forced timestamp token ID.
|
||||
#
|
||||
def tid: () -> Integer
|
||||
|
||||
# Probability of the token.
|
||||
#
|
||||
def probability: () -> Float
|
||||
|
||||
# Log probability of the token.
|
||||
#
|
||||
def log_probability: () -> Float
|
||||
|
||||
# Probability of the timestamp token.
|
||||
#
|
||||
def pt: () -> Float
|
||||
|
||||
# Sum of probability of all timestamp tokens.
|
||||
#
|
||||
def ptsum: () -> Float
|
||||
|
||||
# [EXPERIMENTAL] Token-level timestamps with DTW
|
||||
#
|
||||
# Do not use if you haven't computed token-level timestamps with dtw.
|
||||
# Roughly corresponds to the moment in audio in which the token was output.
|
||||
#
|
||||
def t_dtw: () -> Integer
|
||||
|
||||
# Voice length of the token.
|
||||
#
|
||||
def voice_length: () -> Float
|
||||
|
||||
# Start time of the token.
|
||||
#
|
||||
# Token-level timestamp data.
|
||||
# Do not use if you haven't computed token-level timestamps.
|
||||
#
|
||||
def start_time: () -> Integer
|
||||
|
||||
# End time of the token.
|
||||
#
|
||||
# Token-level timestamp data.
|
||||
# Do not use if you haven't computed token-level timestamps.
|
||||
#
|
||||
def end_time: () -> Integer
|
||||
|
||||
# Get the token text of the token.
|
||||
#
|
||||
def text: () -> String
|
||||
def deconstruct_keys: (Array[:id | :tid | :probability | :log_probability | :pt | :ptsum | :t_dtw | :voice_length | :start_time | :end_time | :text] | nil) -> deconstructed_keys
|
||||
end
|
||||
|
||||
module VAD
|
||||
class Params
|
||||
def self.new: (
|
||||
?threshold: Float,
|
||||
?min_speech_duration_ms: Integer,
|
||||
?min_silence_duration_ms: Integer,
|
||||
?max_speech_duration_s: Float,
|
||||
?speech_pad_ms: Integer,
|
||||
?samples_overlap: Float
|
||||
) -> instance
|
||||
|
||||
# Probability threshold to consider as speech.
|
||||
#
|
||||
def threshold=: (Float) -> Float
|
||||
|
||||
def threshold: () -> Float
|
||||
|
||||
# Min duration for a valid speech segment.
|
||||
#
|
||||
def min_speech_duration_ms=: (Integer) -> Integer
|
||||
|
||||
def min_speech_duration_ms: () -> Integer
|
||||
|
||||
# Min silence duration to consider speech as ended.
|
||||
#
|
||||
def min_silence_duration_ms=: (Integer) -> Integer
|
||||
|
||||
def min_silence_duration_ms: () -> Integer
|
||||
|
||||
# Max duration of a speech segment before forcing a new segment.
|
||||
def max_speech_duration_s=: (Float) -> Float
|
||||
|
||||
def max_speech_duration_s: () -> Float
|
||||
|
||||
# Padding added before and after speech segments.
|
||||
#
|
||||
def speech_pad_ms=: (Integer) -> Integer
|
||||
|
||||
def speech_pad_ms: () -> Integer
|
||||
|
||||
# Overlap in seconds when copying audio samples from speech segment.
|
||||
#
|
||||
def samples_overlap=: (Float) -> Float
|
||||
|
||||
def samples_overlap: () -> Float
|
||||
def ==: (Params) -> (true | false)
|
||||
end
|
||||
|
||||
class Context
|
||||
def self.new: (String | path | ::URI::HTTP model_name_or_path) -> instance
|
||||
def segments_from_samples: (Params, Array[Float] samples, ?Integer n_samples) -> Segments
|
||||
| (Params, _Samples, ?Integer n_samples) -> Segments
|
||||
def detect: (path wav_file_path, Params) -> Segments
|
||||
end
|
||||
|
||||
class Segments
|
||||
include Enumerable[Segment]
|
||||
|
||||
def each: { (Segment) -> void } -> void
|
||||
| () -> Enumerator[Segment]
|
||||
def length: -> Integer
|
||||
end
|
||||
|
||||
class Segment
|
||||
type deconstructed_keys = {
|
||||
start_time: (Integer | nil),
|
||||
end_time: (Integer | nil),
|
||||
}
|
||||
|
||||
def start_time: () -> Integer
|
||||
def end_time: () -> Integer
|
||||
def deconstruct_keys: (Array[:start_time | :end_time] | nil) -> deconstructed_keys
|
||||
end
|
||||
end
|
||||
|
||||
class Error < StandardError
|
||||
|
|
|
|||
|
|
@ -1,82 +0,0 @@
|
|||
require_relative "helper"
|
||||
|
||||
class TestContextParams < TestBase
|
||||
PARAM_NAMES = [
|
||||
:use_gpu,
|
||||
:flash_attn,
|
||||
:gpu_device,
|
||||
:dtw_token_timestamps,
|
||||
:dtw_aheads_preset,
|
||||
:dtw_n_top
|
||||
]
|
||||
|
||||
def test_new
|
||||
params = Whisper::Context::Params.new
|
||||
assert_instance_of Whisper::Context::Params, params
|
||||
end
|
||||
|
||||
def test_attributes
|
||||
params = Whisper::Context::Params.new
|
||||
|
||||
assert_true params.use_gpu
|
||||
params.use_gpu = false
|
||||
assert_false params.use_gpu
|
||||
|
||||
assert_true params.flash_attn
|
||||
params.flash_attn = false
|
||||
assert_false params.flash_attn
|
||||
|
||||
assert_equal 0, params.gpu_device
|
||||
params.gpu_device = 1
|
||||
assert_equal 1, params.gpu_device
|
||||
|
||||
assert_false params.dtw_token_timestamps
|
||||
params.dtw_token_timestamps = true
|
||||
assert_true params.dtw_token_timestamps
|
||||
|
||||
assert_equal Whisper::AHEADS_NONE, params.dtw_aheads_preset
|
||||
params.dtw_aheads_preset =Whisper::AHEADS_BASE
|
||||
assert_equal Whisper::AHEADS_BASE, params.dtw_aheads_preset
|
||||
|
||||
assert_nil params.dtw_n_top
|
||||
params.dtw_n_top = 6
|
||||
assert_equal 6, params.dtw_n_top
|
||||
params.dtw_n_top = nil
|
||||
assert_nil params.dtw_n_top
|
||||
end
|
||||
|
||||
def test_new_with_kw_args
|
||||
params = Whisper::Context::Params.new(use_gpu: false)
|
||||
assert_false params.use_gpu
|
||||
end
|
||||
|
||||
def test_new_with_kw_wargs_non_existent
|
||||
assert_raise ArgumentError do
|
||||
Whisper::Context::Params.new(non_existent: "value")
|
||||
end
|
||||
end
|
||||
|
||||
data(PARAM_NAMES.collect {|param| [param, param]}.to_h)
|
||||
def test_new_with_kw_args_default_values(param)
|
||||
default_params = Whisper::Context::Params.new
|
||||
default_value = default_params.send(param)
|
||||
value = if param == :dtw_n_top
|
||||
6
|
||||
else
|
||||
case default_value
|
||||
in true | false
|
||||
!default_value
|
||||
in Integer
|
||||
default_value + 1
|
||||
end
|
||||
end
|
||||
params = Whisper::Context::Params.new(param => value)
|
||||
assert_equal value, params.send(param)
|
||||
|
||||
PARAM_NAMES.reject {|name| name == param}.each do |name|
|
||||
expected = default_params.send(name)
|
||||
actual = params.send(name)
|
||||
assert_equal expected, actual
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
@ -1,52 +0,0 @@
|
|||
require_relative "helper"
|
||||
require 'tempfile'
|
||||
require 'tmpdir'
|
||||
require 'open3'
|
||||
|
||||
class TestPackage < TestBase
|
||||
def test_build
|
||||
Tempfile.create do |file|
|
||||
assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path, exception: true)
|
||||
assert file.size > 0
|
||||
assert_path_exist file.to_path
|
||||
end
|
||||
end
|
||||
|
||||
sub_test_case "Building binary on installation" do
|
||||
def setup
|
||||
system "rake", "build", exception: true
|
||||
end
|
||||
|
||||
def test_install
|
||||
gemspec = Gem::Specification.load("whispercpp.gemspec")
|
||||
Dir.mktmpdir do |dir|
|
||||
system "gem", "install", "--install-dir", dir, "--no-document", File.join("pkg", gemspec.file_name), exception: true
|
||||
assert_installed dir, gemspec.version
|
||||
end
|
||||
end
|
||||
|
||||
def test_install_with_coreml
|
||||
omit_unless RUBY_PLATFORM.match?(/darwin/) do
|
||||
gemspec = Gem::Specification.load("whispercpp.gemspec")
|
||||
Dir.mktmpdir do |dir|
|
||||
system "gem", "install", "--install-dir", dir, "--no-document", File.join("pkg", gemspec.file_name), "--", "--enable-whisper-coreml", exception: true
|
||||
assert_installed dir, gemspec.version
|
||||
libdir = File.join(dir, "gems", "#{gemspec.name}-#{gemspec.version}", "lib")
|
||||
assert_nothing_raised do
|
||||
system "ruby", "-I", libdir, "-r", "whisper", "-e", "Whisper::Context.new('tiny')", exception: true
|
||||
end
|
||||
output, status = Open3.capture2("ruby", "-I", libdir, "-r", "whisper", "-e", "puts Whisper.system_info_str")
|
||||
assert_match /COREML = 1/, output
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def assert_installed(dir, version)
|
||||
assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", "whisper.#{RbConfig::CONFIG["DLEXT"]}")
|
||||
assert_path_exist File.join(dir, "gems/whispercpp-#{version}/LICENSE")
|
||||
assert_path_not_exist File.join(dir, "gems/whispercpp-#{version}/ext/build")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
@ -1,145 +0,0 @@
|
|||
require_relative "helper"
|
||||
|
||||
class TestSegment < TestBase
|
||||
def test_iteration
|
||||
whisper.each_segment do |segment|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
end
|
||||
end
|
||||
|
||||
def test_enumerator
|
||||
enum = whisper.each_segment
|
||||
assert_instance_of Enumerator, enum
|
||||
enum.to_a.each_with_index do |segment, index|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
assert_kind_of Integer, index
|
||||
end
|
||||
end
|
||||
|
||||
def test_start_time
|
||||
i = 0
|
||||
whisper.each_segment do |segment|
|
||||
assert_equal 0, segment.start_time if i == 0
|
||||
i += 1
|
||||
end
|
||||
end
|
||||
|
||||
def test_end_time
|
||||
i = 0
|
||||
whisper.each_segment do |segment|
|
||||
assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
|
||||
i += 1
|
||||
end
|
||||
end
|
||||
|
||||
def test_no_speech_prob
|
||||
no_speech_prob = nil
|
||||
whisper.each_segment do |segment|
|
||||
no_speech_prob = segment.no_speech_prob
|
||||
end
|
||||
assert no_speech_prob > 0.0
|
||||
end
|
||||
|
||||
def test_on_new_segment
|
||||
params = Whisper::Params.new
|
||||
seg = nil
|
||||
index = 0
|
||||
params.on_new_segment do |segment|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
if index == 0
|
||||
seg = segment
|
||||
assert_equal 0, segment.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
|
||||
end
|
||||
index += 1
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
assert_equal 0, seg.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
|
||||
end
|
||||
|
||||
def test_on_new_segment_twice
|
||||
params = Whisper::Params.new
|
||||
seg = nil
|
||||
params.on_new_segment do |segment|
|
||||
seg = segment
|
||||
return
|
||||
end
|
||||
params.on_new_segment do |segment|
|
||||
assert_same seg, segment
|
||||
return
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
end
|
||||
|
||||
def test_transcription_after_segment_retrieved
|
||||
segment = whisper.each_segment.first
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
|
||||
|
||||
whisper.transcribe(AUDIO, Whisper::Params.new(offset: 5000))
|
||||
assert_not_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
|
||||
assert_match(/what you can do for your country/i, segment.text)
|
||||
end
|
||||
|
||||
def test_pattern_matching
|
||||
segment = whisper.each_segment.first
|
||||
segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
||||
|
||||
assert_equal segment.start_time, start_time
|
||||
assert_equal segment.end_time, end_time
|
||||
assert_equal segment.text, text
|
||||
assert_equal segment.no_speech_prob, no_speech_prob
|
||||
assert_equal segment.speaker_turn_next?, speaker_turn_next
|
||||
end
|
||||
|
||||
def test_pattern_matching_partial
|
||||
segment = whisper.each_segment.first
|
||||
segment => {start_time:, end_time:, text:}
|
||||
|
||||
assert_equal segment.start_time, start_time
|
||||
assert_equal segment.end_time, end_time
|
||||
assert_equal segment.text, text
|
||||
end
|
||||
|
||||
def test_deconstruct_keys
|
||||
segment = whisper.each_segment.first
|
||||
expected = {
|
||||
start_time: segment.start_time,
|
||||
end_time: segment.end_time,
|
||||
text: segment.text,
|
||||
no_speech_prob: segment.no_speech_prob,
|
||||
speaker_turn_next: segment.speaker_turn_next?
|
||||
}
|
||||
assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_non_existent
|
||||
omit "Undefined behavior"
|
||||
|
||||
segment = whisper.each_segment.first
|
||||
|
||||
assert_equal({}, segment.deconstruct_keys([:non_existent]))
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_too_many_keys
|
||||
omit "Undefined behavior"
|
||||
|
||||
segment = whisper.each_segment.first
|
||||
|
||||
assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_includes_non_existent_keys_not_too_many
|
||||
omit "Undefined behavior"
|
||||
|
||||
segment = whisper.each_segment.first
|
||||
|
||||
expected = {
|
||||
start_time: segment.start_time,
|
||||
end_time: segment.end_time,
|
||||
text: segment.text,
|
||||
no_speech_prob: segment.no_speech_prob
|
||||
}
|
||||
assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
|
||||
end
|
||||
end
|
||||
|
|
@ -1,81 +0,0 @@
|
|||
require_relative "helper"
|
||||
|
||||
class TestToken < TestBase
|
||||
def setup
|
||||
@segment = whisper.each_segment.first
|
||||
@token = @segment.each_token.first
|
||||
end
|
||||
|
||||
def test_n_tokens
|
||||
assert_equal 27, @segment.n_tokens
|
||||
end
|
||||
|
||||
def test_allocate
|
||||
token = Whisper::Token.allocate
|
||||
assert_raise do
|
||||
token.id
|
||||
end
|
||||
end
|
||||
|
||||
def test_each_token
|
||||
i = 0
|
||||
@segment.each_token do |token|
|
||||
i += 1
|
||||
assert_instance_of Whisper::Token, token
|
||||
end
|
||||
assert_equal 27, i
|
||||
end
|
||||
|
||||
def test_each_token_without_block
|
||||
assert_instance_of Enumerator, @segment.each_token
|
||||
end
|
||||
|
||||
def test_token
|
||||
assert_instance_of Whisper::Token, @token
|
||||
|
||||
assert_instance_of Integer, @token.id
|
||||
assert_instance_of Float, @token.probability
|
||||
assert_instance_of Float, @token.log_probability
|
||||
|
||||
assert_instance_of Integer, @token.tid
|
||||
assert_instance_of Float, @token.pt
|
||||
assert_instance_of Float, @token.ptsum
|
||||
|
||||
assert_instance_of Integer, @token.start_time
|
||||
assert_instance_of Integer, @token.end_time
|
||||
|
||||
assert_instance_of Integer, @token.t_dtw
|
||||
|
||||
assert_instance_of Float, @token.voice_length
|
||||
|
||||
assert_instance_of String, @token.text
|
||||
end
|
||||
|
||||
def test_text
|
||||
assert_equal ["[_BEG_]", " And", " so", " my", " fellow", " Americans", ",", " ask", " not", " what", " your", " country", " can", " do", " for", " you", ",", " ask", " what", " you", " can", " do", " for", " your", " country", ".", "[_TT_550]"],
|
||||
@segment.each_token.collect(&:text)
|
||||
end
|
||||
|
||||
def test_token_timestamps
|
||||
params = Whisper::Params.new(token_timestamps: true)
|
||||
whisper.transcribe(TestBase::AUDIO, params)
|
||||
prev = -1
|
||||
whisper.each_segment.first.each_token do |token|
|
||||
assert token.start_time >= prev
|
||||
assert token.end_time >= token.start_time
|
||||
prev = token.end_time
|
||||
end
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_with_nil
|
||||
keys = %i[id tid probability log_probability pt ptsum t_dtw voice_length start_time end_time text]
|
||||
expected = keys.collect {|key| [key, @token.send(key)] }.to_h
|
||||
assert_equal(expected, @token.deconstruct_keys(nil))
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_with_keys
|
||||
keys = %i[id tid probability log_probability pt ptsum t_dtw voice_length start_time end_time text]
|
||||
expected = keys.collect {|key| [key, @token.send(key)] }.to_h
|
||||
assert_equal expected, @token.deconstruct_keys(keys)
|
||||
end
|
||||
end
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
require_relative "helper"
|
||||
|
||||
# End-to-end check that transcription still yields the expected JFK sentence
# when voice activity detection is switched on through Whisper::Params.
class TestVAD < TestBase
  def setup
    @whisper = Whisper::Context.new("base.en")
    # The VAD model is given by its short name; VAD tuning parameters are
    # left at whatever Whisper::VAD::Params.new provides.
    @params = Whisper::Params.new(
      vad: true,
      vad_model_path: "silero-v6.2.0",
      vad_params: Whisper::VAD::Params.new
    )
  end

  def test_transcribe
    # The pattern accepts either "," or "." between the two clauses.
    @whisper.transcribe(TestBase::AUDIO, @params) { |text|
      assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
    }
  end
end
|
||||
|
|
@ -1,100 +0,0 @@
|
|||
require_relative "helper"
|
||||
|
||||
class TestVADContext < TestBase
|
||||
def test_initialize
|
||||
context = Whisper::VAD::Context.new("silero-v6.2.0")
|
||||
assert_instance_of Whisper::VAD::Context, context
|
||||
end
|
||||
|
||||
def test_detect
|
||||
context = Whisper::VAD::Context.new("silero-v6.2.0")
|
||||
segments = context.detect(AUDIO, Whisper::VAD::Params.new)
|
||||
assert_segments segments
|
||||
end
|
||||
|
||||
def test_invalid_model_type
|
||||
assert_raise TypeError do
|
||||
Whisper::VAD::Context.new(Object.new)
|
||||
end
|
||||
end
|
||||
|
||||
def test_allocate
|
||||
vad = Whisper::VAD::Context.allocate
|
||||
assert_raise do
|
||||
vad.detect(AUDIO, Whisper::VAD::Params.new)
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def assert_segments(segments)
|
||||
assert_instance_of Whisper::VAD::Segments, segments
|
||||
|
||||
i = 0
|
||||
segments.each do |segment|
|
||||
i += 1
|
||||
assert_instance_of Whisper::VAD::Segment, segment
|
||||
end
|
||||
assert i > 0
|
||||
|
||||
segments.each_with_index do |segment, index|
|
||||
assert_instance_of Integer, index
|
||||
end
|
||||
|
||||
assert_instance_of Enumerator, segments.each
|
||||
|
||||
segment = segments.each.first
|
||||
assert_instance_of Float, segment.start_time
|
||||
assert_instance_of Float, segment.end_time
|
||||
|
||||
segment => {start_time:, end_time:}
|
||||
assert_equal segment.start_time, start_time
|
||||
assert_equal segment.end_time, end_time
|
||||
|
||||
assert_equal 4, segments.length
|
||||
end
|
||||
|
||||
sub_test_case "from samples" do
|
||||
def setup
|
||||
super
|
||||
@vad = Whisper::VAD::Context.new("silero-v6.2.0")
|
||||
@samples = File.read(AUDIO, nil, 78).unpack("s<*").collect {|i| i.to_f / 2**15}
|
||||
end
|
||||
|
||||
def test_segments_from_samples
|
||||
segments = @vad.segments_from_samples(Whisper::VAD::Params.new, @samples, @samples.length)
|
||||
assert_segments segments
|
||||
end
|
||||
|
||||
def test_segments_from_samples_without_length
|
||||
segments = @vad.segments_from_samples(Whisper::VAD::Params.new, @samples)
|
||||
assert_segments segments
|
||||
end
|
||||
|
||||
def test_segments_from_samples_enumerator
|
||||
samples = @samples.each
|
||||
segments = @vad.segments_from_samples(Whisper::VAD::Params.new, samples, @samples.length)
|
||||
assert_segments segments
|
||||
end
|
||||
|
||||
def test_segments_from_samples_enumerator_without_length
|
||||
samples = @samples.each
|
||||
assert_raise ArgumentError do
|
||||
@vad.segments_from_samples(Whisper::VAD::Params.new, samples)
|
||||
end
|
||||
end
|
||||
|
||||
def test_segments_from_samples_enumerator_with_too_large_length
|
||||
samples = @samples.each.take(10).to_enum
|
||||
assert_raise StopIteration do
|
||||
@vad.segments_from_samples(Whisper::VAD::Params.new, samples, 11)
|
||||
end
|
||||
end
|
||||
|
||||
def test_segments_from_samples_with_memory_view
|
||||
samples = JFKReader.new(AUDIO)
|
||||
segments = @vad.segments_from_samples(Whisper::VAD::Params.new, samples)
|
||||
assert_segments segments
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
@ -1,103 +0,0 @@
|
|||
require_relative "helper"
|
||||
|
||||
class TestVADParams < TestBase
|
||||
PARAM_NAMES = [
|
||||
:threshold,
|
||||
:min_speech_duration_ms,
|
||||
:min_silence_duration_ms,
|
||||
:max_speech_duration_s,
|
||||
:speech_pad_ms,
|
||||
:samples_overlap
|
||||
]
|
||||
|
||||
def setup
|
||||
@params = Whisper::VAD::Params.new
|
||||
end
|
||||
|
||||
def test_new
|
||||
params = Whisper::VAD::Params.new
|
||||
assert_kind_of Whisper::VAD::Params, params
|
||||
end
|
||||
|
||||
def test_threshold
|
||||
assert_in_delta @params.threshold, 0.5
|
||||
@params.threshold = 0.7
|
||||
assert_in_delta @params.threshold, 0.7
|
||||
end
|
||||
|
||||
def test_min_speech_duration
|
||||
pend
|
||||
end
|
||||
|
||||
def test_min_speech_duration_ms
|
||||
assert_equal 250, @params.min_speech_duration_ms
|
||||
@params.min_speech_duration_ms = 500
|
||||
assert_equal 500, @params.min_speech_duration_ms
|
||||
end
|
||||
|
||||
def test_min_silence_duration_ms
|
||||
assert_equal 100, @params.min_silence_duration_ms
|
||||
@params.min_silence_duration_ms = 200
|
||||
assert_equal 200, @params.min_silence_duration_ms
|
||||
end
|
||||
|
||||
def test_max_speech_duration
|
||||
pend
|
||||
end
|
||||
|
||||
def test_max_speech_duration_s
|
||||
assert @params.max_speech_duration_s >= 10e37 # Defaults to FLT_MAX
|
||||
@params.max_speech_duration_s = 60.0
|
||||
assert_equal 60.0, @params.max_speech_duration_s
|
||||
end
|
||||
|
||||
def test_speech_pad_ms
|
||||
assert_equal 30, @params.speech_pad_ms
|
||||
@params.speech_pad_ms = 50
|
||||
assert_equal 50, @params.speech_pad_ms
|
||||
end
|
||||
|
||||
def test_samples_overlap
|
||||
assert_in_delta @params.samples_overlap, 0.1
|
||||
@params.samples_overlap = 0.5
|
||||
assert_in_delta @params.samples_overlap, 0.5
|
||||
end
|
||||
|
||||
def test_equal
|
||||
assert_equal @params, Whisper::VAD::Params.new
|
||||
end
|
||||
|
||||
def test_new_with_kw_args
|
||||
params = Whisper::VAD::Params.new(threshold: 0.7)
|
||||
assert_in_delta params.threshold, 0.7
|
||||
assert_equal 250, params.min_speech_duration_ms
|
||||
end
|
||||
|
||||
def test_new_with_kw_args_non_existent
|
||||
assert_raise ArgumentError do
|
||||
Whisper::VAD::Params.new(non_existent: "value")
|
||||
end
|
||||
end
|
||||
|
||||
data(PARAM_NAMES.collect {|param| [param, param]}.to_h)
|
||||
def test_new_with_kw_args_default_values(param)
|
||||
default_value = @params.send(param)
|
||||
value = default_value + 1
|
||||
params = Whisper::VAD::Params.new(param => value)
|
||||
if Float === value
|
||||
assert_in_delta value, params.send(param)
|
||||
else
|
||||
assert_equal value, params.send(param)
|
||||
end
|
||||
|
||||
PARAM_NAMES.reject {|name| name == param}.each do |name|
|
||||
expected = @params.send(name)
|
||||
actual = params.send(name)
|
||||
if Float === expected
|
||||
assert_in_delta expected, actual
|
||||
else
|
||||
assert_equal expected, actual
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
require_relative "helper"
|
||||
|
||||
# Checks that a Whisper::VAD::Segment created directly with .new (rather than
# obtained from a detection result) raises when its attributes are read.
class TestVADSegment < TestBase
  def test_initialize
    segment = Whisper::VAD::Segment.new

    assert_raise do
      segment.start_time
    end

    # Must reference the local `segment`: an undefined name here would raise
    # NameError and satisfy assert_raise without ever calling #end_time.
    assert_raise do
      segment.end_time
    end

    # Pattern matching against the segment is expected to raise as well.
    assert_raise do
      segment => {start_time:, end_time:}
    end
  end
end
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
require_relative "helper"
|
||||
|
||||
# Checks that a Whisper::VAD::Segments created directly with .new raises
# when it is iterated or asked for its length.
class TestVADSegments < TestBase
  def test_initialize
    segments = Whisper::VAD::Segments.new

    # Iteration with an empty block is enough to trigger the error.
    assert_raise { segments.each { |segment| } }

    assert_raise { segments.length }
  end
end
|
||||
|
|
@ -3,12 +3,12 @@ require "whisper"
|
|||
require_relative "jfk_reader/jfk_reader"
|
||||
|
||||
class TestBase < Test::Unit::TestCase
|
||||
AUDIO = File.join(__dir__, "fixtures", "jfk.wav")
|
||||
AUDIO = File.join(__dir__, "..", "..", "..", "samples", "jfk.wav")
|
||||
|
||||
class << self
|
||||
def whisper
|
||||
return @whisper if @whisper
|
||||
attr_reader :whisper
|
||||
|
||||
def startup
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
params = Whisper::Params.new
|
||||
params.print_timestamps = false
|
||||
|
|
@ -111,48 +111,6 @@ class TestCallback < TestBase
|
|||
assert_equal 100, last
|
||||
end
|
||||
|
||||
def test_encoder_begin_callback
|
||||
i = 0
|
||||
@params.encoder_begin_callback = ->(context, state, user_data) {
|
||||
i += 1
|
||||
}
|
||||
@whisper.transcribe(@audio, @params)
|
||||
assert i > 0
|
||||
end
|
||||
|
||||
# Verifies that returning false from encoder_begin_callback makes the library
# log an "aborting" error message.
def test_encoder_begin_callback_abort
  logs = []
  # Capture only error-level log lines emitted during transcription.
  Whisper.log_set -> (level, buffer, user_data) {
    logs << buffer if level == Whisper::LOG_LEVEL_ERROR
  }, logs
  @params.encoder_begin_callback = ->(context, state, user_data) {
    return false
  }
  @whisper.transcribe(@audio, @params)
  assert_match(/encoder_begin_callback returned false - aborting/, logs.join)
ensure
  # Restore the no-op logger even when the assertion fails or transcribe
  # raises, so the capturing callback does not leak into later tests.
  Whisper.log_set ->(level, buffer, user_data) {}, nil
end
|
||||
|
||||
def test_encoder_begin_callback_user_data
|
||||
udata = Object.new
|
||||
@params.encoder_begin_callback_user_data = udata
|
||||
yielded = nil
|
||||
@params.encoder_begin_callback = ->(context, state, user_data) {
|
||||
yielded = user_data
|
||||
}
|
||||
@whisper.transcribe(@audio, @params)
|
||||
assert_same udata, yielded
|
||||
end
|
||||
|
||||
def test_on_encoder_begin
|
||||
i = 0
|
||||
@params.on_encoder_begin do
|
||||
i += 1
|
||||
end
|
||||
@whisper.transcribe(@audio, @params)
|
||||
assert i > 0
|
||||
end
|
||||
|
||||
def test_abort_callback
|
||||
i = 0
|
||||
@params.abort_callback = ->(user_data) {
|
||||
|
|
@ -106,13 +106,4 @@ class TestModel < TestBase
|
|||
assert_equal 1, model.ftype
|
||||
assert_equal "base", model.type
|
||||
end
|
||||
|
||||
def test_coreml_model_auto_download
|
||||
uri = Whisper::Model.coreml_compiled_models[Whisper::Model.pre_converted_models["tiny"]]
|
||||
model_path = Pathname(uri.to_path).sub_ext("")
|
||||
model_path.rmtree if model_path.exist?
|
||||
|
||||
uri.cache
|
||||
assert_path_exist model_path
|
||||
end
|
||||
end
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
require_relative "helper"
|
||||
require 'tempfile'
|
||||
require 'tmpdir'
|
||||
require 'shellwords'
|
||||
|
||||
# Packaging tests: the gem can be built from the gemspec, and the built gem
# installs into a scratch directory with its native extension in place.
class TestPackage < TestBase
  def test_build
    Tempfile.create do |file|
      # Multi-argument Kernel#system runs the program directly without a
      # shell, so the path is passed as-is; shell-escaping it would send
      # literal backslashes to `gem` for paths with special characters.
      assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path, exception: true)
      assert file.size > 0
      assert_path_exist file.to_path
    end
  end

  sub_test_case "Building binary on installation" do
    def setup
      system "rake", "build", exception: true
    end

    def test_install
      # The gem file name and version are scraped from the rake task listing.
      match_data = `rake -Tbuild`.match(/(whispercpp-(.+)\.gem)/)
      assert_not_nil match_data, "could not find the gem file name in `rake -Tbuild` output"
      filename = match_data[1]
      version = match_data[2]
      basename = "whisper.#{RbConfig::CONFIG["DLEXT"]}"
      Dir.mktmpdir do |dir|
        # No shell is involved here either, so the directory and file name
        # are passed unescaped and the install location matches the path
        # asserted below.
        system "gem", "install", "--install-dir", dir, "--no-document", File.join("pkg", filename), exception: true
        assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", basename)
      end
    end
  end
end
|
||||
|
|
@ -13,10 +13,8 @@ class TestParams < TestBase
|
|||
:suppress_blank,
|
||||
:suppress_nst,
|
||||
:token_timestamps,
|
||||
:max_len,
|
||||
:split_on_word,
|
||||
:initial_prompt,
|
||||
:carry_initial_prompt,
|
||||
:diarize,
|
||||
:offset,
|
||||
:duration,
|
||||
|
|
@ -34,9 +32,6 @@ class TestParams < TestBase
|
|||
:progress_callback_user_data,
|
||||
:abort_callback,
|
||||
:abort_callback_user_data,
|
||||
:vad,
|
||||
:vad_model_path,
|
||||
:vad_params,
|
||||
]
|
||||
|
||||
def setup
|
||||
|
|
@ -46,8 +41,6 @@ class TestParams < TestBase
|
|||
def test_language
|
||||
@params.language = "en"
|
||||
assert_equal @params.language, "en"
|
||||
GC.compact
|
||||
assert_equal @params.language, "en"
|
||||
@params.language = "auto"
|
||||
assert_equal @params.language, "auto"
|
||||
end
|
||||
|
|
@ -122,13 +115,6 @@ class TestParams < TestBase
|
|||
assert !@params.print_timestamps
|
||||
end
|
||||
|
||||
def test_carry_initial_prompt
|
||||
@params.carry_initial_prompt = true
|
||||
assert @params.carry_initial_prompt
|
||||
@params.carry_initial_prompt = false
|
||||
assert !@params.carry_initial_prompt
|
||||
end
|
||||
|
||||
def test_suppress_blank
|
||||
@params.suppress_blank = true
|
||||
assert @params.suppress_blank
|
||||
|
|
@ -150,13 +136,6 @@ class TestParams < TestBase
|
|||
assert !@params.token_timestamps
|
||||
end
|
||||
|
||||
def test_max_len
|
||||
@params.max_len = 42
|
||||
assert_equal @params.max_len, 42
|
||||
@params.max_len = 0
|
||||
assert_equal @params.max_len, 0
|
||||
end
|
||||
|
||||
def test_split_on_word
|
||||
@params.split_on_word = true
|
||||
assert @params.split_on_word
|
||||
|
|
@ -212,50 +191,6 @@ class TestParams < TestBase
|
|||
assert_in_delta 0.2, @params.no_speech_thold
|
||||
end
|
||||
|
||||
def test_vad
|
||||
assert_false @params.vad
|
||||
@params.vad = true
|
||||
assert_true @params.vad
|
||||
end
|
||||
|
||||
def test_vad_model_path
|
||||
assert_nil @params.vad_model_path
|
||||
@params.vad_model_path = "silero-v6.2.0"
|
||||
assert_equal Whisper::Model.pre_converted_models["silero-v6.2.0"].to_path, @params.vad_model_path
|
||||
end
|
||||
|
||||
def test_vad_model_path_with_nil
|
||||
@params.vad_model_path = "silero-v6.2.0"
|
||||
@params.vad_model_path = nil
|
||||
assert_nil @params.vad_model_path
|
||||
end
|
||||
|
||||
def test_vad_model_path_with_invalid
|
||||
assert_raise TypeError do
|
||||
@params.vad_model_path = Object.new
|
||||
end
|
||||
end
|
||||
|
||||
def test_vad_model_path_with_URI_string
|
||||
@params.vad_model_path = "https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin"
|
||||
assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v6.2.0"].to_path
|
||||
end
|
||||
|
||||
def test_vad_model_path_with_URI
|
||||
@params.vad_model_path = URI("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin")
|
||||
assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v6.2.0"].to_path
|
||||
end
|
||||
|
||||
def test_vad_params
|
||||
assert_kind_of Whisper::VAD::Params, @params.vad_params
|
||||
default_params = @params.vad_params
|
||||
assert_same default_params, @params.vad_params
|
||||
assert_equal 0.5, default_params.threshold
|
||||
new_params = Whisper::VAD::Params.new
|
||||
@params.vad_params = new_params
|
||||
assert_same new_params, @params.vad_params
|
||||
end
|
||||
|
||||
def test_new_with_kw_args
|
||||
params = Whisper::Params.new(language: "es")
|
||||
assert_equal "es", params.language
|
||||
|
|
@ -290,10 +225,6 @@ class TestParams < TestBase
|
|||
proc {}
|
||||
in [/_user_data\Z/, *]
|
||||
Object.new
|
||||
in [:vad_model_path, *]
|
||||
Whisper::Model.pre_converted_models["silero-v6.2.0"].to_path
|
||||
in [:vad_params, *]
|
||||
Whisper::VAD::Params.new
|
||||
end
|
||||
params = Whisper::Params.new(param => value)
|
||||
if Float === value
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
require_relative "helper"
|
||||
|
||||
class TestSegment < TestBase
|
||||
def test_iteration
|
||||
whisper.each_segment do |segment|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
end
|
||||
end
|
||||
|
||||
def test_enumerator
|
||||
enum = whisper.each_segment
|
||||
assert_instance_of Enumerator, enum
|
||||
enum.to_a.each_with_index do |segment, index|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
assert_kind_of Integer, index
|
||||
end
|
||||
end
|
||||
|
||||
def test_start_time
|
||||
i = 0
|
||||
whisper.each_segment do |segment|
|
||||
assert_equal 0, segment.start_time if i == 0
|
||||
i += 1
|
||||
end
|
||||
end
|
||||
|
||||
def test_end_time
|
||||
i = 0
|
||||
whisper.each_segment do |segment|
|
||||
assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
|
||||
i += 1
|
||||
end
|
||||
end
|
||||
|
||||
def test_no_speech_prob
|
||||
no_speech_prob = nil
|
||||
whisper.each_segment do |segment|
|
||||
no_speech_prob = segment.no_speech_prob
|
||||
end
|
||||
assert no_speech_prob > 0.0
|
||||
end
|
||||
|
||||
def test_on_new_segment
|
||||
params = Whisper::Params.new
|
||||
seg = nil
|
||||
index = 0
|
||||
params.on_new_segment do |segment|
|
||||
assert_instance_of Whisper::Segment, segment
|
||||
if index == 0
|
||||
seg = segment
|
||||
assert_equal 0, segment.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
|
||||
end
|
||||
index += 1
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
assert_equal 0, seg.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
|
||||
end
|
||||
|
||||
def test_on_new_segment_twice
|
||||
params = Whisper::Params.new
|
||||
seg = nil
|
||||
params.on_new_segment do |segment|
|
||||
seg = segment
|
||||
return
|
||||
end
|
||||
params.on_new_segment do |segment|
|
||||
assert_same seg, segment
|
||||
return
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
end
|
||||
end
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
require_relative "helper"
|
||||
require "stringio"
|
||||
require "etc"
|
||||
require "pathname"
|
||||
|
||||
# Exists to detect memory-related bug
|
||||
Whisper.log_set ->(level, buffer, user_data) {}, nil
|
||||
|
|
@ -21,44 +20,6 @@ class TestWhisper < TestBase
|
|||
}
|
||||
end
|
||||
|
||||
def test_whisper_pathname
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
params = Whisper::Params.new
|
||||
|
||||
@whisper.transcribe(Pathname(AUDIO), params) {|text|
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
|
||||
}
|
||||
end
|
||||
|
||||
def test_transcribe_non_parallel
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
params = Whisper::Params.new
|
||||
|
||||
@whisper.transcribe(AUDIO, params, n_processors: 1) {|text|
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
|
||||
}
|
||||
end
|
||||
|
||||
def test_transcribe_n_processors
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
params = Whisper::Params.new
|
||||
|
||||
without_log_callback do
|
||||
@whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
|
||||
assert_match(/what you can do for your country/i, text)
|
||||
}
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def without_log_callback
|
||||
Whisper.log_set nil, nil
|
||||
yield
|
||||
ensure
|
||||
Whisper.log_set ->(level, buffer, user_data) {}, nil
|
||||
end
|
||||
|
||||
sub_test_case "After transcription" do
|
||||
def test_full_n_segments
|
||||
assert_equal 1, whisper.full_n_segments
|
||||
|
|
@ -133,14 +94,6 @@ class TestWhisper < TestBase
|
|||
end
|
||||
end
|
||||
|
||||
# Checks that Whisper.system_info_str starts with the COREML and OPENVINO
# feature flags, separated by literal " | " delimiters.
def test_system_info_str
  # "|" must be escaped: unescaped it is regex alternation, and the trailing
  # empty alternative would match any string, making the assertion vacuous.
  assert_match(/\AWHISPER : COREML = \d \| OPENVINO = \d \|/, Whisper.system_info_str)
end
|
||||
|
||||
def test_version
|
||||
assert_kind_of String, Whisper::VERSION
|
||||
end
|
||||
|
||||
def test_log_set
|
||||
user_data = Object.new
|
||||
logs = []
|
||||
|
|
@ -170,13 +123,6 @@ class TestWhisper < TestBase
|
|||
$stderr = stderr
|
||||
end
|
||||
|
||||
def test_access_attribute_without_initialization
|
||||
whisper = Whisper::Context.allocate
|
||||
assert_raise do
|
||||
whisper.model_type
|
||||
end
|
||||
end
|
||||
|
||||
sub_test_case "full" do
|
||||
def setup
|
||||
super
|
||||
|
|
@ -228,21 +174,9 @@ class TestWhisper < TestBase
|
|||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
|
||||
end
|
||||
|
||||
def test_full_with_memroy_view_gc
|
||||
samples = JFKReader.new(AUDIO)
|
||||
@whisper.full(@params, samples)
|
||||
GC.start
|
||||
require "fiddle"
|
||||
Fiddle::MemoryView.export samples do |view|
|
||||
assert_equal 176000, view.to_s.unpack("#{view.format}*").length
|
||||
end
|
||||
end
|
||||
|
||||
def test_full_parallel
|
||||
nprocessors = 2
|
||||
without_log_callback do
|
||||
@whisper.full_parallel(@params, @samples, @samples.length, nprocessors)
|
||||
end
|
||||
@whisper.full_parallel(@params, @samples, @samples.length, nprocessors)
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
|
|
@ -253,9 +187,7 @@ class TestWhisper < TestBase
|
|||
def test_full_parallel_with_memory_view
|
||||
nprocessors = 2
|
||||
samples = JFKReader.new(AUDIO)
|
||||
without_log_callback do
|
||||
@whisper.full_parallel(@params, samples, nil, nprocessors)
|
||||
end
|
||||
@whisper.full_parallel(@params, samples, nil, nprocessors)
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
|
|
@ -274,9 +206,7 @@ class TestWhisper < TestBase
|
|||
|
||||
def test_full_parallel_without_length
|
||||
nprocessors = 2
|
||||
without_log_callback do
|
||||
@whisper.full_parallel(@params, @samples, nil, nprocessors)
|
||||
end
|
||||
@whisper.full_parallel(@params, @samples, nil, nprocessors)
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
|
|
@ -293,48 +223,4 @@ class TestWhisper < TestBase
|
|||
assert_match(/for your country/i, text)
|
||||
end
|
||||
end
|
||||
|
||||
def test_to_srt
|
||||
whisper = Whisper::Context.new("base.en")
|
||||
whisper.transcribe AUDIO, @params
|
||||
|
||||
lines = whisper.to_srt.lines
|
||||
assert_match(/\A\d+\n/, lines[0])
|
||||
assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
|
||||
end
|
||||
|
||||
def test_to_webvtt
|
||||
whisper = Whisper::Context.new("base.en")
|
||||
whisper.transcribe AUDIO, @params
|
||||
|
||||
lines = whisper.to_webvtt.lines
|
||||
assert_equal "WEBVTT\n", lines[0]
|
||||
assert_equal "\n", lines[1]
|
||||
assert_match(/\A\d+\n/, lines[2])
|
||||
assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
|
||||
end
|
||||
|
||||
sub_test_case "Format needs escape" do
|
||||
def setup
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
@whisper.transcribe AUDIO, Whisper::Params.new
|
||||
segment = @whisper.each_segment.first
|
||||
segment.define_singleton_method :text do
|
||||
"& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
|
||||
end
|
||||
@whisper.define_singleton_method :each_segment do
|
||||
Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
|
||||
end
|
||||
end
|
||||
|
||||
def test_to_srt_escape
|
||||
assert_equal "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
|
||||
end
|
||||
|
||||
def test_to_webvtt_escape
|
||||
assert_equal "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
@ -3,7 +3,8 @@ require_relative "extsources"
|
|||
Gem::Specification.new do |s|
|
||||
s.name = "whispercpp"
|
||||
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
|
||||
s.version = '1.3.7'
|
||||
s.version = '1.3.1'
|
||||
s.date = '2024-12-19'
|
||||
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
|
||||
s.email = 'todd.fisher@gmail.com'
|
||||
s.extra_rdoc_files = ['LICENSE', 'README.md']
|
||||
|
|
@ -14,19 +15,18 @@ Gem::Specification.new do |s|
|
|||
if s.extra_rdoc_files.include?(basename)
|
||||
basename
|
||||
else
|
||||
file.sub("../..", "ext/sources")
|
||||
.sub("../javascript", "ext/sources/bindings/javascript")
|
||||
file.sub("../..", "ext")
|
||||
end
|
||||
}
|
||||
|
||||
s.summary = %q{Ruby whisper.cpp bindings}
|
||||
s.test_files = s.files.select {|file| file.start_with? "test/"}
|
||||
s.test_files = s.files.select {|file| file.start_with? "tests/"}
|
||||
|
||||
s.extensions << 'ext/extconf.rb'
|
||||
s.required_ruby_version = '>= 3.1.0'
|
||||
|
||||
#### Documentation and testing.
|
||||
s.homepage = 'https://github.com/ggml-org/whisper.cpp'
|
||||
s.homepage = 'https://github.com/ggerganov/whisper.cpp'
|
||||
s.rdoc_options = ['--main', 'README.md']
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -15,7 +15,6 @@ GGML_METAL_EMBED_LIBRARY=ON
|
|||
GGML_BLAS_DEFAULT=ON
|
||||
GGML_METAL_USE_BF16=ON
|
||||
GGML_OPENMP=OFF
|
||||
BUILD_STATIC_XCFRAMEWORK=${BUILD_STATIC_XCFRAMEWORK:-OFF}
|
||||
|
||||
COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
|
||||
COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
|
||||
|
|
@ -42,11 +41,6 @@ COMMON_CMAKE_ARGS=(
|
|||
-DGGML_OPENMP=${GGML_OPENMP}
|
||||
)
|
||||
|
||||
XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
|
||||
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
|
||||
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
|
||||
echo "Detected Xcode version: $XCODE_VERSION"
|
||||
|
||||
check_required_tool() {
|
||||
local tool=$1
|
||||
local install_message=$2
|
||||
|
|
@ -328,15 +322,6 @@ combine_static_libraries() {
|
|||
arch_flags+=" -arch $arch"
|
||||
done
|
||||
|
||||
|
||||
if [[ "${BUILD_STATIC_XCFRAMEWORK}" == "ON" ]]; then
|
||||
echo "Packaging static framework for ${platform}."
|
||||
mkdir -p "$(dirname "${base_dir}/${output_lib}")"
|
||||
cp "${temp_dir}/combined.a" "${base_dir}/${output_lib}"
|
||||
rm -rf "${temp_dir}"
|
||||
return
|
||||
fi
|
||||
|
||||
# Create dynamic library
|
||||
echo "Creating dynamic library for ${platform}."
|
||||
xcrun -sdk $sdk clang++ -dynamiclib \
|
||||
|
|
@ -350,28 +335,21 @@ combine_static_libraries() {
|
|||
|
||||
# Platform-specific post-processing for device builds
|
||||
if [[ "$is_simulator" == "false" ]]; then
|
||||
if command -v xcrun vtool &>/dev/null; then
|
||||
if command -v vtool &>/dev/null; then
|
||||
case "$platform" in
|
||||
"ios")
|
||||
echo "Marking binary as a framework binary for iOS..."
|
||||
xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
|
||||
vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"visionos")
|
||||
echo "Marking binary as a framework binary for visionOS..."
|
||||
if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
|
||||
echo "Xcode version greater than 16.2, using visionOS."
|
||||
VISION_OS_BUILD_VERSION="visionos"
|
||||
else
|
||||
echo "Xcode version less than or equal to 16.2, using xros."
|
||||
VISION_OS_BUILD_VERSION="xros"
|
||||
fi
|
||||
xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
|
||||
vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"tvos")
|
||||
echo "Marking binary as a framework binary for tvOS..."
|
||||
xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
|
||||
vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
esac
|
||||
|
|
@ -539,27 +517,13 @@ combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
|
|||
|
||||
# Create XCFramework with correct debug symbols paths
|
||||
echo "Creating XCFramework..."
|
||||
|
||||
if [[ "${BUILD_STATIC_XCFRAMEWORK}" == "ON" ]]; then
|
||||
xcodebuild -create-xcframework \
|
||||
-framework $(pwd)/build-ios-sim/framework/whisper.framework \
|
||||
-framework $(pwd)/build-ios-device/framework/whisper.framework \
|
||||
-framework $(pwd)/build-macos/framework/whisper.framework \
|
||||
-framework $(pwd)/build-visionos/framework/whisper.framework \
|
||||
-framework $(pwd)/build-visionos-sim/framework/whisper.framework \
|
||||
-framework $(pwd)/build-tvos-device/framework/whisper.framework \
|
||||
-framework $(pwd)/build-tvos-sim/framework/whisper.framework \
|
||||
-output $(pwd)/build-apple/whisper.xcframework
|
||||
exit 0
|
||||
fi
|
||||
|
||||
xcodebuild -create-xcframework \
|
||||
-framework $(pwd)/build-ios-sim/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-ios-sim/dSYMs/whisper.dSYM \
|
||||
-framework $(pwd)/build-ios-device/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-ios-device/dSYMs/whisper.dSYM \
|
||||
-framework $(pwd)/build-macos/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-macos/dSYMs/whisper.dSYM \
|
||||
-debug-symbols $(pwd)/build-macos/dSYMS/whisper.dSYM \
|
||||
-framework $(pwd)/build-visionos/framework/whisper.framework \
|
||||
-debug-symbols $(pwd)/build-visionos/dSYMs/whisper.dSYM \
|
||||
-framework $(pwd)/build-visionos-sim/framework/whisper.framework \
|
||||
|
|
|
|||
93
ci/run.sh
93
ci/run.sh
|
|
@ -24,9 +24,9 @@ mkdir -p "$2"
|
|||
OUT=$(realpath "$1")
|
||||
MNT=$(realpath "$2")
|
||||
|
||||
rm -vf $OUT/*.log
|
||||
rm -vf $OUT/*.exit
|
||||
rm -vf $OUT/*.md
|
||||
rm -f "$OUT/*.log"
|
||||
rm -f "$OUT/*.exit"
|
||||
rm -f "$OUT/*.md"
|
||||
|
||||
sd=`dirname $0`
|
||||
cd $sd/../
|
||||
|
|
@ -50,35 +50,8 @@ fi
|
|||
|
||||
CMAKE_EXTRA="-DWHISPER_FATAL_WARNINGS=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
|
||||
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
|
||||
if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
|
||||
else
|
||||
echo "Warning: Using fallback CUDA architectures"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
|
||||
fi
|
||||
else
|
||||
echo "Error: nvidia-smi not found, cannot build with CUDA"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_ROCM} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_HIP=ON"
|
||||
if [ -z ${GG_BUILD_AMDGPU_TARGETS} ]; then
|
||||
echo "Missing GG_BUILD_AMDGPU_TARGETS, please set it to your GPU architecture (e.g. gfx90a, gfx1100, etc.)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DAMDGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||
|
|
@ -87,38 +60,28 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
|
|||
echo "source /opt/intel/oneapi/setvars.sh"
|
||||
exit 1
|
||||
fi
|
||||
# Use only main GPU
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
|
||||
# Enable sysman for correct memory reporting
|
||||
export ZES_ENABLE_SYSMAN=1
|
||||
# to circumvent precision issues on CPY operations
|
||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DWHISPER_OPENVINO=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||
|
||||
# if on Mac, disable METAL
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
fi
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
|
||||
if [ ! -z ${GG_BUILD_BLAS} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_MUSA} ]; then
|
||||
# Use qy1 by default (MTT S80)
|
||||
MUSA_ARCH=${MUSA_ARCH:-21}
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_NO_SVE} ]; then
|
||||
# arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
|
||||
if [ ! -z ${GG_BUILD_COREML} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DWHISPER_COREML=ON"
|
||||
fi
|
||||
|
||||
## helpers
|
||||
|
|
@ -215,7 +178,7 @@ function gg_run_ctest {
|
|||
mode=$2
|
||||
|
||||
cd ${SRC}
|
||||
|
||||
|
||||
rm -rf build-ci-${mode} && mkdir build-ci-${mode} && cd build-ci-${mode}
|
||||
|
||||
set -e
|
||||
|
|
@ -246,7 +209,7 @@ function gg_run_bench {
|
|||
cd ${SRC}
|
||||
|
||||
# set flash attention flag if enabled
|
||||
fattn="-nfa"
|
||||
fattn=""
|
||||
if [ "$BENCH_FLASH_ATTN" -eq 1 ]; then
|
||||
fattn="-fa"
|
||||
fi
|
||||
|
|
@ -256,7 +219,7 @@ function gg_run_bench {
|
|||
echo "Running memcpy benchmark"
|
||||
(time ./build-ci-release/bin/whisper-bench -w 1 -t $BENCH_N_THREADS 2>&1) | tee -a $OUT/${ci}-memcpy.log
|
||||
gg_check_last_command_status "$OUT/${ci}-memcpy.exit" "memcpy benchmark"
|
||||
|
||||
|
||||
echo "Running ggml_mul_mat benchmark with $BENCH_N_THREADS threads"
|
||||
(time ./build-ci-release/bin/whisper-bench -w 2 -t $BENCH_N_THREADS 2>&1) | tee -a $OUT/${ci}-mul_mat.log
|
||||
gg_check_last_command_status "$OUT/${ci}-mul_mat.exit" "ggml_mul_mat benchmark"
|
||||
|
|
@ -270,8 +233,6 @@ function gg_run_bench {
|
|||
printf "| %16s | %13s | %3s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---"
|
||||
} | tee -a $OUT/${ci}-models-table.log
|
||||
|
||||
res=0
|
||||
|
||||
# run benchmark for each model
|
||||
for model in "${MODELS[@]}"; do
|
||||
echo "Benchmarking model: $model"
|
||||
|
|
@ -322,11 +283,8 @@ function gg_run_bench {
|
|||
| tee -a $OUT/${ci}-models-table.log
|
||||
else
|
||||
echo "Benchmark failed for model: $model" | tee -a $OUT/${ci}-bench-errors.log
|
||||
res=1
|
||||
fi
|
||||
done
|
||||
|
||||
return $res
|
||||
}
|
||||
|
||||
function gg_sum_bench {
|
||||
|
|
@ -368,12 +326,11 @@ ret=0
|
|||
for model in "${MODELS[@]}"; do
|
||||
test $ret -eq 0 && gg_download_model ${model}
|
||||
done
|
||||
|
||||
test $ret -eq 0 && gg_run ctest debug
|
||||
if [ -z ${GG_BUILD_SYCL}]; then
|
||||
test $ret -eq 0 && gg_run ctest debug
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest release
|
||||
|
||||
test $ret -eq 0 && gg_run bench
|
||||
|
||||
cat $OUT/README.md
|
||||
|
||||
exit $ret
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ jobs:
|
|||
issues: write
|
||||
pull-requests: write
|
||||
steps:
|
||||
- uses: actions/stale@v10
|
||||
- uses: actions/stale@v5
|
||||
with:
|
||||
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
|
||||
days-before-issue-stale: 30
|
||||
|
|
|
|||
|
|
@ -1,16 +0,0 @@
|
|||
set( CMAKE_SYSTEM_NAME Darwin )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-apple-darwin-macho )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
|
||||
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
set( CMAKE_SYSTEM_NAME Windows )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-pc-windows-msvc )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
|
||||
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
|
|
@ -1,29 +0,0 @@
|
|||
set(CMAKE_SYSTEM_NAME Linux)
|
||||
set(CMAKE_SYSTEM_PROCESSOR riscv64)
|
||||
set(CMAKE_SYSTEM_VERSION 1)
|
||||
|
||||
if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
|
||||
message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
|
||||
else()
|
||||
set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
|
||||
if (DEFINED ENV{RISCV_ROOT_PATH})
|
||||
file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
|
||||
else()
|
||||
message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
|
||||
endif()
|
||||
|
||||
set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
|
||||
set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
|
||||
set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
|
||||
set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
|
||||
set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
|
||||
set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
|
||||
endif()
|
||||
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
|
||||
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
|
||||
|
|
@ -3,25 +3,60 @@ set(WHISPER_BUILD_COMMIT @WHISPER_BUILD_COMMIT@)
|
|||
set(WHISPER_BUILD_NUMBER @WHISPER_BUILD_NUMBER@)
|
||||
set(WHISPER_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||
|
||||
set(GGML_BLAS @GGML_BLAS@)
|
||||
set(GGML_CUDA @GGML_CUDA@)
|
||||
set(GGML_METAL @GGML_METAL@)
|
||||
set(GGML_HIPBLAS @GGML_HIPBLAS@)
|
||||
set(GGML_ACCELERATE @GGML_ACCELERATE@)
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set_and_check(WHISPER_INCLUDE_DIR "@PACKAGE_WHISPER_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(WHISPER_LIB_DIR "@PACKAGE_WHISPER_LIB_INSTALL_DIR@")
|
||||
set_and_check(WHISPER_BIN_DIR "@PACKAGE_WHISPER_BIN_INSTALL_DIR@")
|
||||
|
||||
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
|
||||
# Ensure transient dependencies satisfied
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_BLAS)
|
||||
find_package(BLAS REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_HIPBLAS)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
endif()
|
||||
|
||||
find_library(whisper_LIBRARY whisper
|
||||
REQUIRED
|
||||
HINTS ${WHISPER_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH
|
||||
)
|
||||
HINTS ${WHISPER_LIB_DIR})
|
||||
|
||||
set(_whisper_link_deps "Threads::Threads" "@WHISPER_EXTRA_LIBS@")
|
||||
set(_whisper_transient_defines "@WHISPER_TRANSIENT_DEFINES@")
|
||||
|
||||
add_library(whisper UNKNOWN IMPORTED)
|
||||
|
||||
set_target_properties(whisper
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${WHISPER_INCLUDE_DIR}"
|
||||
INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
|
||||
INTERFACE_LINK_LIBRARIES "${_whisper_link_deps}"
|
||||
INTERFACE_COMPILE_DEFINITIONS "${_whisper_transient_defines}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${whisper_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES cxx_std_11
|
||||
|
|
|
|||
|
|
@ -1,5 +0,0 @@
|
|||
set( CMAKE_SYSTEM_NAME Windows )
|
||||
set( CMAKE_SYSTEM_PROCESSOR x86_64 )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
|
@ -98,7 +98,6 @@ if (EMSCRIPTEN)
|
|||
add_subdirectory(stream.wasm)
|
||||
add_subdirectory(command.wasm)
|
||||
add_subdirectory(bench.wasm)
|
||||
add_subdirectory(wchess)
|
||||
elseif(CMAKE_JS_VERSION)
|
||||
add_subdirectory(addon.node)
|
||||
else()
|
||||
|
|
@ -106,7 +105,6 @@ else()
|
|||
add_subdirectory(bench)
|
||||
add_subdirectory(server)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(vad-speech-segments)
|
||||
if (WHISPER_SDL2)
|
||||
add_subdirectory(stream)
|
||||
add_subdirectory(command)
|
||||
|
|
|
|||
|
|
@ -1,10 +1,8 @@
|
|||
# whisper.cpp Node.js addon
|
||||
# addon
|
||||
|
||||
This is an addon demo that can **perform whisper model reasoning in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
|
||||
It can be used as a reference for using the whisper.cpp project in other node projects.
|
||||
|
||||
This addon now supports **Voice Activity Detection (VAD)** for improved transcription performance.
|
||||
|
||||
## Install
|
||||
|
||||
```shell
|
||||
|
|
@ -28,88 +26,12 @@ For Electron addon and cmake-js options, you can see [cmake-js](https://github.c
|
|||
|
||||
## Run
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```shell
|
||||
cd examples/addon.node
|
||||
|
||||
node index.js --language='language' --model='model-path' --fname_inp='file-path'
|
||||
```
|
||||
|
||||
### VAD (Voice Activity Detection) Usage
|
||||
Because this is a simple Demo, only the above parameters are set in the node environment.
|
||||
|
||||
Run the VAD example with performance comparison:
|
||||
|
||||
```shell
|
||||
node vad-example.js
|
||||
```
|
||||
|
||||
## Voice Activity Detection (VAD) Support
|
||||
|
||||
VAD can significantly improve transcription performance by only processing speech segments, which is especially beneficial for audio files with long periods of silence.
|
||||
|
||||
### VAD Model Setup
|
||||
|
||||
Before using VAD, download a VAD model:
|
||||
|
||||
```shell
|
||||
# From the whisper.cpp root directory
|
||||
./models/download-vad-model.sh silero-v6.2.0
|
||||
```
|
||||
|
||||
### VAD Parameters
|
||||
|
||||
All VAD parameters are optional and have sensible defaults:
|
||||
|
||||
- `vad`: Enable VAD (default: false)
|
||||
- `vad_model`: Path to VAD model file (required when VAD enabled)
|
||||
- `vad_threshold`: Speech detection threshold 0.0-1.0 (default: 0.5)
|
||||
- `vad_min_speech_duration_ms`: Min speech duration in ms (default: 250)
|
||||
- `vad_min_silence_duration_ms`: Min silence duration in ms (default: 100)
|
||||
- `vad_max_speech_duration_s`: Max speech duration in seconds (default: FLT_MAX)
|
||||
- `vad_speech_pad_ms`: Speech padding in ms (default: 30)
|
||||
- `vad_samples_overlap`: Sample overlap 0.0-1.0 (default: 0.1)
|
||||
|
||||
### JavaScript API Example
|
||||
|
||||
```javascript
|
||||
const path = require("path");
|
||||
const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));
|
||||
const { promisify } = require("util");
|
||||
|
||||
const whisperAsync = promisify(whisper);
|
||||
|
||||
// With VAD enabled
|
||||
const vadParams = {
|
||||
language: "en",
|
||||
model: path.join(__dirname, "../../models/ggml-base.en.bin"),
|
||||
fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
|
||||
vad: true,
|
||||
vad_model: path.join(__dirname, "../../models/ggml-silero-v6.2.0.bin"),
|
||||
vad_threshold: 0.5,
|
||||
progress_callback: (progress) => console.log(`Progress: ${progress}%`)
|
||||
};
|
||||
|
||||
whisperAsync(vadParams).then(result => console.log(result));
|
||||
```
|
||||
|
||||
## Supported Parameters
|
||||
|
||||
Both traditional whisper.cpp parameters and new VAD parameters are supported:
|
||||
|
||||
- `language`: Language code (e.g., "en", "es", "fr")
|
||||
- `model`: Path to whisper model file
|
||||
- `fname_inp`: Path to input audio file
|
||||
- `use_gpu`: Enable GPU acceleration (default: true)
|
||||
- `flash_attn`: Enable flash attention (default: false)
|
||||
- `no_prints`: Disable console output (default: false)
|
||||
- `no_timestamps`: Disable timestamps (default: false)
|
||||
- `detect_language`: Auto-detect language (default: false)
|
||||
- `audio_ctx`: Audio context size (default: 0)
|
||||
- `max_len`: Maximum segment length (default: 0)
|
||||
- `max_context`: Maximum context size (default: -1)
|
||||
- `prompt`: Initial prompt for decoder
|
||||
- `comma_in_time`: Use comma in timestamps (default: true)
|
||||
- `print_progress`: Print progress info (default: false)
|
||||
- `progress_callback`: Progress callback function
|
||||
- VAD parameters (see above section)
|
||||
Other parameters can also be specified in the node environment.
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue