Merge remote-tracking branch 'origin/master' into stream-pcm

commit 200bcecbb1

@@ -0,0 +1,20 @@
+FROM ubuntu:24.04 AS build
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y build-essential wget cmake git libvulkan-dev glslc \
+    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY .. .
+RUN make base.en CMAKE_ARGS="-DGGML_VULKAN=1"
+
+FROM ubuntu:24.04 AS runtime
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y curl ffmpeg libsdl2-dev wget cmake git libvulkan1 mesa-vulkan-drivers \
+    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY --from=build /app /app
+ENV PATH=/app/build/bin:$PATH
+ENTRYPOINT [ "bash", "-c" ]
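For reference, the image this Dockerfile produces can be built and exercised locally along these lines (the Dockerfile path matches the `main-vulkan` entry added to the Docker workflow further down; the local tag is illustrative):

```bash
# build the Vulkan-enabled image from the repository root (tag is illustrative)
docker build -f .devops/main-vulkan.Dockerfile -t whisper.cpp:main-vulkan .

# ENTRYPOINT is `bash -c`, so the command is passed as a single string
docker run -it --rm -v path/to/models:/models \
  whisper.cpp:main-vulkan "whisper-cli -m /models/ggml-base.bin -f ./samples/jfk.wav"
```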
@@ -13,10 +13,10 @@ jobs:
   ubuntu-22:
     runs-on: ubuntu-22.04
     steps:
-      - uses: actions/setup-go@v5
+      - uses: actions/setup-go@v6
        with:
          go-version: '^1.23'
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
      - run: |
          cd bindings/go
          make test
@@ -17,5 +17,5 @@ jobs:
       - uses: ruby/setup-ruby@v1
         with:
           ruby-version: '3.2'
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - run: rake test
@@ -67,7 +67,7 @@ jobs:

     steps:
       - name: Checkout with full history
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0

@@ -127,7 +127,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

@@ -159,7 +159,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

@@ -174,10 +174,6 @@ jobs:
             sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
             sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list

-            apt-get update
-            apt-get install -y ca-certificates
-            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
-
             apt update
             apt install -y build-essential libsdl2-dev cmake git
             cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a

@@ -195,7 +191,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

@@ -210,10 +206,6 @@ jobs:
             sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
             sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list

-            apt-get update
-            apt-get install -y ca-certificates
-            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
-
             apt update
             apt install -y build-essential libsdl2-dev cmake git
             cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp

@@ -231,7 +223,7 @@ jobs:
     steps:
       - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16

@@ -263,7 +255,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v4
+#        uses: actions/checkout@v6
 #
 #      - name: Build
 #        uses: cross-platform-actions/action@v0.27.0

@@ -289,7 +281,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

@@ -323,7 +315,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

@@ -338,10 +330,6 @@ jobs:
             sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
             sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list

-            apt-get update
-            apt-get install -y ca-certificates
-            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
-
             apt update
             apt install -y build-essential cmake libsdl2-dev git
             cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a

@@ -361,7 +349,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

@@ -376,10 +364,6 @@ jobs:
             sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
             sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list

-            apt-get update
-            apt-get install -y ca-certificates
-            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
-
             apt update
             apt install -y build-essential cmake libsdl2-dev git
             cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp

@@ -402,7 +386,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

@@ -417,10 +401,6 @@ jobs:
             sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
             sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list

-            apt-get update
-            apt-get install -y ca-certificates
-            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
-
             apt update
             apt install -y clang build-essential cmake libsdl2-dev git
             cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang

@@ -440,7 +420,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

@@ -480,7 +460,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: add oneAPI to apt
        shell: bash

@@ -504,7 +484,7 @@ jobs:

       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Build
        id: cmake_build

@@ -532,7 +512,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: add oneAPI to apt
        shell: bash

@@ -556,7 +536,7 @@ jobs:

       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Build
        id: cmake_build

@@ -581,7 +561,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Setup ${{ matrix.sys }}
        uses: msys2/setup-msys2@v2

@@ -636,7 +616,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Add msbuild to PATH
        uses: microsoft/setup-msbuild@v2

@@ -666,31 +646,31 @@ jobs:

       - name: Upload SDL2.dll
         if: matrix.sdl2 == 'ON'
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: ${{ matrix.s2arc }}_SDL2.dll
           path: build/bin/${{ matrix.build }}/SDL2.dll

       - name: Upload whisper dll
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: whisper_${{ matrix.arch }}.dll
           path: build/bin/${{ matrix.build }}/whisper.dll

       - name: Upload ggml dll
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: ggml_${{ matrix.arch }}.dll
           path: build/bin/${{ matrix.build }}/ggml.dll

       - name: Upload ggml base dll
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: ggml_base_${{ matrix.arch }}.dll
           path: build/bin/${{ matrix.build }}/ggml-base.dll

       - name: Upload ggml cpu dll
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: ggml_cpu_${{ matrix.arch }}.dll
           path: build/bin/${{ matrix.build }}/ggml-cpu.dll

@@ -702,7 +682,7 @@ jobs:

       - name: Upload binaries
         if: matrix.sdl2 == 'ON' && ${{ needs.determine-tag.outputs.should_release }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: whisper-bin-${{ matrix.arch }}.zip
           path: whisper-bin-${{ matrix.arch }}.zip

@@ -731,10 +711,10 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Export GitHub Actions cache environment variables
-        uses: actions/github-script@v7
+        uses: actions/github-script@v8
         with:
           script: |
             core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');

@@ -788,7 +768,7 @@ jobs:

       - name: Upload binaries
         if: matrix.blas == 'ON' && matrix.sdl2 == 'ON' && ${{ needs.determine-tag.outputs.should_release }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: whisper-blas-bin-${{ matrix.arch }}.zip
           path: whisper-blas-bin-${{ matrix.arch }}.zip

@@ -812,7 +792,7 @@ jobs:
       sdl2_ver: 2.28.5
     steps:
       - name: Clone repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Install Ninja
        id: install_ninja

@@ -997,7 +977,7 @@ jobs:

       - name: Upload binaries
         if: ${{ needs.determine-tag.outputs.should_release }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}.zip
           path: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}.zip

@@ -1013,7 +993,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Setup emsdk
        uses: mymindstorm/setup-emsdk@v14

@@ -1036,7 +1016,7 @@ jobs:

     steps:
       - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Configure
        run: |

@@ -1078,7 +1058,7 @@ jobs:

       - name: Upload artifacts
         if: ${{ needs.determine-tag.outputs.should_release }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           path: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip
           name: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip

@@ -1090,12 +1070,12 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           path: whisper

      - name: Install Java
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
           distribution: zulu
           java-version: 21

@@ -1119,10 +1099,10 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: set up JDK 11
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
           java-version: '11'
           distribution: 'temurin'

@@ -1145,36 +1125,36 @@ jobs:
     needs: ['windows']
     runs-on: windows-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6

      - name: Install Java
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
           distribution: zulu
           java-version: 20

      - name: Download Whisper Windows lib
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           name: whisper_x64.dll

      - name: Download GGML Windows lib
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           name: ggml_x64.dll

      - name: Download GGML Base Windows lib
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           name: ggml_base_x64.dll

      - name: Download GGML CPU Windows lib
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           name: ggml_cpu_x64.dll

      - name: Download SDL2.dll
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           name: x64_SDL2.dll

@@ -1221,7 +1201,7 @@ jobs:
           Compress-Archive -Path "bindings/java/build/libs/whispercpp-*.jar" -DestinationPath "whispercpp.jar.zip"

      - name: Upload jar
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: whispercpp.jar.zip
           path: whispercpp.jar.zip

@@ -1245,7 +1225,7 @@ jobs:

     steps:
       - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Test quantize
        run: |

@@ -1269,7 +1249,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0

@@ -1282,7 +1262,7 @@ jobs:
       # Downloads all the artifacts from the previous jobs
       - name: Download artifacts
         id: download-artifact
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           path: ./artifact

@@ -1332,7 +1312,7 @@ jobs:

     steps:
       - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set environment variables
        id: set_vars

@@ -1358,7 +1338,7 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Build
        shell: bash

@@ -1378,7 +1358,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16

@@ -1403,7 +1383,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16

@@ -1428,7 +1408,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16

@@ -1453,7 +1433,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16

@@ -1478,7 +1458,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16

@@ -1503,7 +1483,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci

@@ -1517,7 +1497,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci

@@ -1531,7 +1511,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci

@@ -1545,7 +1525,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci

@@ -1558,7 +1538,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci

@@ -1571,7 +1551,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci
@@ -22,10 +22,11 @@ jobs:
           - { tag: "main-musa", dockerfile: ".devops/main-musa.Dockerfile", platform: "linux/amd64" }
           - { tag: "main-intel", dockerfile: ".devops/main-intel.Dockerfile", platform: "linux/amd64" }
           - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
+          - { tag: "main-vulkan", dockerfile: ".devops/main-vulkan.Dockerfile", platform: "linux/amd64" }

     steps:
       - name: Check out the repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v6

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

@@ -67,7 +68,7 @@ jobs:
           echo "tags=$TAGS" >> $GITHUB_OUTPUT

      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           context: .
           push: ${{ github.event_name == 'push' }}
@@ -22,10 +22,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Setup Pages
-        uses: actions/configure-pages@v4
+        uses: actions/configure-pages@v5

      - name: Setup emsdk
        uses: mymindstorm/setup-emsdk@v14

@@ -88,7 +88,7 @@ jobs:
           find staging -type f | sort

      - name: Upload artifact
-        uses: actions/upload-pages-artifact@v3
+        uses: actions/upload-pages-artifact@v4
         with:
           path: ./staging
@@ -17,7 +17,7 @@ jobs:
         node-version: [ 16.x, 18.x ]
     steps:
       - name: Clone
-        uses: actions/checkout@v1
+        uses: actions/checkout@v6

      - name: Dependencies
        run: |

@@ -27,7 +27,7 @@ jobs:
           sudo apt-get install libsdl2-dev

      - name: Use Node.js ${{ matrix.node-version }}
-        uses: actions/setup-node@v1
+        uses: actions/setup-node@v6
         with:
           node-version: ${{ matrix.node-version }}
           cache: 'npm'
LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023-2024 The ggml authors
+Copyright (c) 2023-2026 The ggml authors

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
README.md
@@ -443,11 +443,12 @@ ffmpeg -i samples/jfk.wav jfk.opus

 ### Images

-We have two Docker images available for this project:
+We have multiple Docker images available for this project:

 1. `ghcr.io/ggml-org/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
 2. `ghcr.io/ggml-org/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
 3. `ghcr.io/ggml-org/whisper.cpp:main-musa`: Same as `main` but compiled with MUSA support. (platforms: `linux/amd64`)
+4. `ghcr.io/ggml-org/whisper.cpp:main-vulkan`: Same as `main` but compiled with Vulkan support. (platforms: `linux/amd64`)

 ### Usage
@@ -456,15 +457,27 @@ We have multiple Docker images available for this project:
 docker run -it --rm \
   -v path/to/models:/models \
   whisper.cpp:main "./models/download-ggml-model.sh base /models"

 # transcribe an audio file
 docker run -it --rm \
   -v path/to/models:/models \
   -v path/to/audios:/audios \
   whisper.cpp:main "whisper-cli -m /models/ggml-base.bin -f /audios/jfk.wav"

 # transcribe an audio file in samples folder
 docker run -it --rm \
   -v path/to/models:/models \
   whisper.cpp:main "whisper-cli -m /models/ggml-base.bin -f ./samples/jfk.wav"

+# run the web server
+docker run -it --rm -p "8080:8080" \
+  -v path/to/models:/models \
+  whisper.cpp:main "whisper-server --host 127.0.0.1 -m /models/ggml-base.bin"
+
+# run the bench tool on the small.en model using 4 threads
+docker run -it --rm \
+  -v path/to/models:/models \
+  whisper.cpp:main "whisper-bench -m /models/ggml-small.en.bin -t 4"
 ```

 ## Installing with Conan
@@ -750,7 +763,7 @@ argument to `whisper-cli`. In addition to this option a VAD model is also
 required.

 The way this works is that first the audio samples are passed through
-the VAD model which will detect speech segments. Using this information the
+the VAD model which will detect speech segments. Using this information,
 only the speech segments that are detected are extracted from the original audio
 input and passed to whisper for processing. This reduces the amount of audio
 data that needs to be processed by whisper and can significantly speed up the
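As a rough sketch of the resulting invocation (the `--vad` and `--vad-model` flag names and the Silero model filename are assumptions based on the option described above, not confirmed by this diff):

```bash
# pre-filter with the VAD model, then transcribe only the detected speech segments
./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav \
  --vad --vad-model models/ggml-silero-v5.1.2.bin
```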
@@ -282,13 +282,20 @@ func Download(ctx context.Context, p io.Writer, model, out string) (string, error)
 		default:
 			// Read body
 			n, err := resp.Body.Read(data)
+			if n > 0 {
+				if m, err := w.Write(data[:n]); err != nil {
+					return path, err
+				} else {
+					count += int64(m)
+				}
+			}
+
 			if err != nil {
-				DownloadReport(p, pct, count, resp.ContentLength)
+				if err == io.EOF {
+					DownloadReport(p, pct, count, resp.ContentLength)
+					return path, nil
+				}
 				return path, err
-			} else if m, err := w.Write(data[:n]); err != nil {
-				return path, err
-			} else {
-				count += int64(m)
 			}
 		}
 	}
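The reworked loop follows the `io.Reader` contract: `Read` may return `n > 0` together with a non-nil error, so the bytes must be written out before the error is inspected, and `io.EOF` signals normal completion rather than failure. A self-contained sketch of the same pattern (`copyChunks` is a hypothetical helper, not part of the bindings):

```go
package main

import "io"

// copyChunks illustrates the io.Reader contract: consume the n bytes
// returned by Read before looking at err, and treat io.EOF as normal
// termination rather than a failure.
func copyChunks(dst io.Writer, src io.Reader) (count int64, err error) {
	buf := make([]byte, 32*1024)
	for {
		n, rerr := src.Read(buf)
		if n > 0 {
			m, werr := dst.Write(buf[:n])
			count += int64(m)
			if werr != nil {
				return count, werr
			}
		}
		if rerr == io.EOF {
			return count, nil // end of stream: success
		}
		if rerr != nil {
			return count, rerr
		}
	}
}
```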
@@ -247,6 +247,58 @@ whisper.transcribe("path/to/audio.wav", params)

 ```

+### Tokens ###
+
+Each segment has tokens.
+
+To enable token timestamps, you need to set `Whisper::Params#token_timestamps = true`. Then, retrieve tokens from segments using `Whisper::Segment#each_token`.
+
+```ruby
+whisper = Whisper::Context.new("base.en")
+params = Whisper::Params.new(token_timestamps: true)
+whisper
+  .transcribe("path/to/audio.wav", params)
+  .each_segment do |segment|
+    segment.each_token do |token|
+      token => {start_time:, end_time:, text:, probability:}
+      st = "%05.2fs" % (start_time / 1000.0)
+      et = "%05.2fs" % (end_time / 1000.0)
+      prob = "%.1f%%" % (probability * 100)
+      puts "[#{st} --> #{et}] #{text} (#{prob})"
+    end
+  end
+```
+
+```
+[00.00s --> 00.00s] [_BEG_] (84.2%)
+[00.32s --> 00.37s] And (71.2%)
+[00.37s --> 00.53s] so (98.5%)
+[00.69s --> 00.85s] my (70.7%)
+[00.85s --> 01.59s] fellow (99.5%)
+[01.59s --> 02.10s] Americans (90.1%)
+[02.85s --> 03.30s] , (28.4%)
+[03.30s --> 04.14s] ask (79.8%)
+[04.14s --> 04.28s] not (78.9%)
+[05.03s --> 05.35s] what (93.3%)
+[05.41s --> 05.74s] your (98.8%)
+[05.74s --> 06.41s] country (99.6%)
+[06.41s --> 06.74s] can (97.7%)
+[06.74s --> 06.92s] do (99.0%)
+[07.00s --> 07.00s] for (95.8%)
+[07.01s --> 07.52s] you (98.5%)
+[07.81s --> 08.05s] , (49.3%)
+[08.19s --> 08.37s] ask (65.6%)
+[08.37s --> 08.75s] what (98.8%)
+[08.91s --> 09.04s] you (98.2%)
+[09.04s --> 09.32s] can (96.9%)
+[09.32s --> 09.38s] do (90.3%)
+[09.44s --> 09.76s] for (91.8%)
+[09.76s --> 09.99s] your (98.2%)
+[10.02s --> 10.36s] country (99.6%)
+[10.51s --> 10.99s] . (87.0%)
+[11.00s --> 11.00s] [_TT_550] (7.6%)
+```
+
 ### Models ###

 You can see model information:
@@ -323,7 +375,38 @@ whisper
   end
 ```

-The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView. If you can prepare audio data as a C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy.
+The second argument `samples` may be an array, an object with `length` and `each` method, or a MemoryView.
+
+If you can prepare audio data as a C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy.
+
+```ruby
+require "torchaudio"
+require "arrow-numo-narray"
+require "whisper"
+
+waveform, sample_rate = TorchAudio.load("test/fixtures/jfk.wav")
+# Convert Torch::Tensor to Arrow::Array via Numo::NArray
+samples = waveform.squeeze.numo.to_arrow.to_arrow_array
+
+whisper = Whisper::Context.new("base")
+whisper
+  # Arrow::Array exports MemoryView
+  .full(Whisper::Params.new, samples)
+```
+
+Custom context params
+---------------------
+
+You can customize `Whisper::Context`'s behavior using `Whisper::Context::Params`.
+
+```ruby
+context_params = Whisper::Context::Params.new(
+  use_gpu: false,
+  flash_attn: false,
+  # etc
+)
+whisper = Whisper::Context.new("base", context_params)
+```
+
 Using VAD separately from ASR
 -----------------------------
@@ -334,13 +417,27 @@ VAD feature itself is useful. You can use it separately from ASR:
 vad = Whisper::VAD::Context.new("silero-v6.2.0")
 vad
   .detect("path/to/audio.wav", Whisper::VAD::Params.new)
-  .each_with_index do |segment, index|
+  .each.with_index do |segment, index|
     segment => {start_time: st, end_time: ed} # `Segment` responds to `#deconstruct_keys`

     puts "[%{nth}: %{st} --> %{ed}]" % {nth: index + 1, st:, ed:}
   end
 ```

+You may also use the low-level API `Whisper::VAD::Context#segments_from_samples`, which works like `Whisper::Context#full`:
+
+```ruby
+# Ruby Array
+reader = WaveFile::Reader.new("path/to/audio.wav", WaveFile::Format.new(:mono, :float, 16000))
+samples = reader.enum_for(:each_buffer).map(&:samples).flatten
+
+# Or, object which exports MemoryView
+waveform, sample_rate = TorchAudio.load("test/fixtures/jfk.wav")
+samples = waveform.squeeze.numo.to_arrow.to_arrow_array
+
+segments = vad.segments_from_samples(Whisper::VAD::Params.new, samples)
+```
+
 Development
 -----------
@@ -7,6 +7,7 @@ options = Options.new(cmake).to_s
 have_library("gomp") rescue nil
 libs = Dependencies.new(cmake, options).to_s

+$CFLAGS << " -O3 -march=native"
 $INCFLAGS << " -Isources/include -Isources/ggml/include -Isources/examples"
 $LOCAL_LIBS << " #{libs}"
 $cleanfiles << " build #{libs}"
@@ -1,5 +1,3 @@
-#include <ruby.h>
-#include <ruby/memory_view.h>
 #include "ruby_whisper.h"

 VALUE mWhisper;

@@ -35,7 +33,8 @@ static bool is_log_callback_finalized = false;
 // High level API
 extern VALUE ruby_whisper_segment_allocate(VALUE klass);

-extern void init_ruby_whisper_context(VALUE *mWhisper);
+extern VALUE init_ruby_whisper_context(VALUE *mWhisper);
+extern void init_ruby_whisper_context_params(VALUE *cContext);
 extern void init_ruby_whisper_params(VALUE *mWhisper);
 extern void init_ruby_whisper_error(VALUE *mWhisper);
 extern void init_ruby_whisper_segment(VALUE *mWhisper);

@@ -164,6 +163,22 @@ void Init_whisper() {
   rb_define_const(mWhisper, "LOG_LEVEL_DEBUG", INT2NUM(GGML_LOG_LEVEL_DEBUG));
   rb_define_const(mWhisper, "LOG_LEVEL_CONT", INT2NUM(GGML_LOG_LEVEL_CONT));

+  rb_define_const(mWhisper, "AHEADS_NONE", INT2NUM(WHISPER_AHEADS_NONE));
+  rb_define_const(mWhisper, "AHEADS_N_TOP_MOST", INT2NUM(WHISPER_AHEADS_N_TOP_MOST));
+  rb_define_const(mWhisper, "AHEADS_CUSTOM", INT2NUM(WHISPER_AHEADS_CUSTOM));
+  rb_define_const(mWhisper, "AHEADS_TINY_EN", INT2NUM(WHISPER_AHEADS_TINY_EN));
+  rb_define_const(mWhisper, "AHEADS_TINY", INT2NUM(WHISPER_AHEADS_TINY));
+  rb_define_const(mWhisper, "AHEADS_BASE_EN", INT2NUM(WHISPER_AHEADS_BASE_EN));
+  rb_define_const(mWhisper, "AHEADS_BASE", INT2NUM(WHISPER_AHEADS_BASE));
+  rb_define_const(mWhisper, "AHEADS_SMALL_EN", INT2NUM(WHISPER_AHEADS_SMALL_EN));
+  rb_define_const(mWhisper, "AHEADS_SMALL", INT2NUM(WHISPER_AHEADS_SMALL));
+  rb_define_const(mWhisper, "AHEADS_MEDIUM_EN", INT2NUM(WHISPER_AHEADS_MEDIUM_EN));
+  rb_define_const(mWhisper, "AHEADS_MEDIUM", INT2NUM(WHISPER_AHEADS_MEDIUM));
+  rb_define_const(mWhisper, "AHEADS_LARGE_V1", INT2NUM(WHISPER_AHEADS_LARGE_V1));
+  rb_define_const(mWhisper, "AHEADS_LARGE_V2", INT2NUM(WHISPER_AHEADS_LARGE_V2));
+  rb_define_const(mWhisper, "AHEADS_LARGE_V3", INT2NUM(WHISPER_AHEADS_LARGE_V3));
+  rb_define_const(mWhisper, "AHEADS_LARGE_V3_TURBO", INT2NUM(WHISPER_AHEADS_LARGE_V3_TURBO));
+
   rb_define_singleton_method(mWhisper, "lang_max_id", ruby_whisper_s_lang_max_id, 0);
   rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
   rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);

@@ -172,7 +187,8 @@ void Init_whisper() {
   rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
   rb_define_private_method(rb_singleton_class(mWhisper), "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);

-  init_ruby_whisper_context(&mWhisper);
+  cContext = init_ruby_whisper_context(&mWhisper);
+  init_ruby_whisper_context_params(&cContext);
   init_ruby_whisper_params(&mWhisper);
   init_ruby_whisper_error(&mWhisper);
   init_ruby_whisper_segment(&mWhisper);
@@ -1,6 +1,8 @@
 #ifndef RUBY_WHISPER_H
 #define RUBY_WHISPER_H

+#include <ruby.h>
+#include <ruby/memory_view.h>
 #include "whisper.h"

 typedef struct {

@@ -14,6 +16,10 @@ typedef struct {
   struct whisper_context *context;
 } ruby_whisper;

+typedef struct ruby_whisper_context_params {
+  struct whisper_context_params params;
+} ruby_whisper_context_params;
+
 typedef struct {
   struct whisper_full_params params;
   bool diarize;

@@ -35,7 +41,7 @@ typedef struct {

 typedef struct {
   whisper_token_data *token_data;
-  const char *text;
+  VALUE text;
 } ruby_whisper_token;

 typedef struct {

@@ -55,6 +61,13 @@ typedef struct {
   struct whisper_vad_context *context;
 } ruby_whisper_vad_context;

+typedef struct parsed_samples_t {
+  float *samples;
+  int n_samples;
+  rb_memory_view_t memview;
+  bool memview_exported;
+} parsed_samples_t;
+
 #define GetContext(obj, rw) do { \
   TypedData_Get_Struct((obj), ruby_whisper, &ruby_whisper_type, (rw)); \
   if ((rw)->context == NULL) { \

@@ -62,13 +75,28 @@ typedef struct {
   } \
 } while (0)

-#define GetToken(obj, rwt) do { \
+#define GetContextParams(obj, rwcp) do { \
+  TypedData_Get_Struct((obj), ruby_whisper_context_params, &ruby_whisper_context_params_type, (rwcp)); \
+} while (0)
+
+#define GetToken(obj, rwt) do { \
   TypedData_Get_Struct((obj), ruby_whisper_token, &ruby_whisper_token_type, (rwt)); \
   if ((rwt)->token_data == NULL) { \
     rb_raise(rb_eRuntimeError, "Not initialized"); \
   } \
 } while (0)

+#define GetVADContext(obj, rwvc) do { \
+  TypedData_Get_Struct((obj), ruby_whisper_vad_context, &ruby_whisper_vad_context_type, (rwvc)); \
+  if ((rwvc)->context == NULL) { \
+    rb_raise(rb_eRuntimeError, "Not initialized"); \
+  } \
+} while (0)
+
+#define GetVADParams(obj, rwvp) do { \
+  TypedData_Get_Struct((obj), ruby_whisper_vad_params, &ruby_whisper_vad_params_type, (rwvp)); \
+} while (0)
+
 #define GetVADSegments(obj, rwvss) do { \
   TypedData_Get_Struct((obj), ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, (rwvss)); \
   if ((rwvss)->segments == NULL) { \
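The new `parsed_samples_t` struct carries either a borrowed MemoryView or an owned `float` buffer, which is why it records `memview_exported`. For orientation, a minimal sketch of the underlying Ruby MemoryView C API (Ruby 3.0+), independent of whispercpp; the helper name is illustrative:

```c
#include <ruby.h>
#include <ruby/memory_view.h>

// Hypothetical helper: count the items an object exposes via MemoryView.
static long
count_view_items(VALUE obj)
{
    rb_memory_view_t view;

    if (!rb_memory_view_available_p(obj))
        return -1;                      /* object does not export a MemoryView */
    if (!rb_memory_view_get(obj, &view, RUBY_MEMORY_VIEW_SIMPLE))
        return -1;                      /* export failed */

    long n = (long)(view.byte_size / view.item_size);
    /* view.data points at the exporter's buffer: zero copy, but only
       valid until rb_memory_view_release() is called. */
    rb_memory_view_release(&view);
    return n;
}
```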
@@ -1,5 +1,3 @@
-#include <ruby.h>
-#include <ruby/memory_view.h>
 #include "ruby_whisper.h"

 extern ID id_to_s;

@@ -20,6 +18,7 @@ extern VALUE eError;
 extern VALUE cModel;

 extern const rb_data_type_t ruby_whisper_params_type;
+extern const rb_data_type_t ruby_whisper_context_params_type;
 extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
 extern VALUE rb_whisper_model_s_new(VALUE context);
 extern VALUE rb_whisper_segment_s_new(VALUE context, int index);

@@ -27,6 +26,27 @@ extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);

 ID transcribe_option_names[1];

+typedef struct fill_samples_args {
+  float *dest;
+  VALUE *src;
+  int n_samples;
+} fill_samples_args;
+
+typedef struct full_args {
+  VALUE *context;
+  VALUE *params;
+  float *samples;
+  int n_samples;
+} full_args;
+
+typedef struct full_parallel_args {
+  VALUE *context;
+  VALUE *params;
+  float *samples;
+  int n_samples;
+  int n_processors;
+} full_parallel_args;
+
 static void
 ruby_whisper_free(ruby_whisper *rw)
 {

@@ -124,16 +144,25 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
 {
   ruby_whisper *rw;
   VALUE whisper_model_file_path;
+  VALUE context_params;
+  struct whisper_context_params params;

   // TODO: we can support init from buffer here too maybe another ruby object to expose
-  rb_scan_args(argc, argv, "01", &whisper_model_file_path);
+  rb_scan_args(argc, argv, "11", &whisper_model_file_path, &context_params);
   TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);

   whisper_model_file_path = ruby_whisper_normalize_model_path(whisper_model_file_path);
   if (!rb_respond_to(whisper_model_file_path, id_to_s)) {
     rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
   }
-  rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params());
+  if (NIL_P(context_params)) {
+    params = whisper_context_default_params();
+  } else {
+    ruby_whisper_context_params *rwcp;
+    GetContextParams(context_params, rwcp);
+    params = rwcp->params;
+  }
+  rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), params);
   if (rw->context == NULL) {
     rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
   }

@@ -272,6 +301,147 @@ VALUE ruby_whisper_model_type(VALUE self)
   return rb_str_new2(whisper_model_type_readable(rw->context));
 }

+static bool
+check_memory_view(rb_memory_view_t *memview)
+{
+  if (memview->format != NULL && strcmp(memview->format, "f") != 0) {
+    rb_warn("currently only format \"f\" is supported for MemoryView, but given: %s", memview->format);
+    return false;
+  }
+  if (memview->format != NULL && memview->ndim != 1) {
+    rb_warn("currently only 1 dimensional MemoryView is supported, but given: %zd", memview->ndim);
+    return false;
+  }
+
+  return true;
+}
+
+static VALUE
+fill_samples(VALUE rb_args)
+{
+  fill_samples_args *args = (fill_samples_args *)rb_args;
+
+  if (RB_TYPE_P(*args->src, T_ARRAY)) {
+    for (int i = 0; i < args->n_samples; i++) {
+      args->dest[i] = RFLOAT_VALUE(rb_ary_entry(*args->src, i));
+    }
+  } else {
+    // TODO: use rb_block_call
+    VALUE iter = rb_funcall(*args->src, id_to_enum, 1, rb_str_new2("each"));
+    for (int i = 0; i < args->n_samples; i++) {
+      // TODO: check if iter is exhausted and raise ArgumentError appropriately
+      VALUE sample = rb_funcall(iter, id_next, 0);
+      args->dest[i] = RFLOAT_VALUE(sample);
+    }
+  }
+
+  return Qnil;
+}
+
+struct parsed_samples_t
+parse_samples(VALUE *samples, VALUE *n_samples)
+{
+  bool memview_available = rb_memory_view_available_p(*samples);
+  struct parsed_samples_t parsed = {0};
+  parsed.memview_exported = false;
+  const bool is_array = RB_TYPE_P(*samples, T_ARRAY);
+
+  if (!NIL_P(*n_samples)) {
+    parsed.n_samples = NUM2INT(*n_samples);
+    if (is_array) {
+      if (RARRAY_LEN(*samples) < parsed.n_samples) {
+        rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(*samples), parsed.n_samples);
+      }
+    }
+    // Should check when samples.respond_to?(:length)?
+  } else {
+    if (is_array) {
+      if (RARRAY_LEN(*samples) > INT_MAX) {
+        rb_raise(rb_eArgError, "samples are too long");
+      }
+      parsed.n_samples = (int)RARRAY_LEN(*samples);
+    } else if (memview_available) {
+      bool memview_got = rb_memory_view_get(*samples, &parsed.memview, RUBY_MEMORY_VIEW_SIMPLE);
+      if (memview_got) {
+        parsed.memview_exported = check_memory_view(&parsed.memview);
+        if (!parsed.memview_exported) {
+          rb_memory_view_release(&parsed.memview);
+          parsed.memview = (rb_memory_view_t){0};
+        }
+      }
+      if (parsed.memview_exported) {
+        ssize_t n_samples_size = parsed.memview.byte_size / parsed.memview.item_size;
+        if (n_samples_size > INT_MAX) {
+          rb_memory_view_release(&parsed.memview);
+          rb_raise(rb_eArgError, "samples are too long: %zd", n_samples_size);
+        }
+        parsed.n_samples = (int)n_samples_size;
+      } else {
+        rb_warn("unable to get a memory view. falls back to Ruby object");
+        if (rb_respond_to(*samples, id_length)) {
+          parsed.n_samples = NUM2INT(rb_funcall(*samples, id_length, 0));
+        } else {
+          rb_raise(rb_eArgError, "samples must respond to :length");
+        }
+      }
+    } else if (rb_respond_to(*samples, id_length)) {
+      parsed.n_samples = NUM2INT(rb_funcall(*samples, id_length, 0));
+    } else {
+      rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of float when n_samples is not given");
+    }
+  }
+
+  if (parsed.memview_exported) {
+    parsed.samples = (float *)parsed.memview.data;
+  } else {
+    parsed.samples = ALLOC_N(float, parsed.n_samples);
+    fill_samples_args args = {
+      parsed.samples,
+      samples,
+      parsed.n_samples,
+    };
+    int state;
+    rb_protect(fill_samples, (VALUE)&args, &state);
+    if (state) {
+      xfree(parsed.samples);
+      rb_jump_tag(state);
+    }
+  }
+
+  return parsed;
+}
+
+VALUE
+release_samples(VALUE rb_parsed_args)
+{
+  parsed_samples_t *parsed_args = (parsed_samples_t *)rb_parsed_args;
+
+  if (parsed_args->memview_exported) {
+    rb_memory_view_release(&parsed_args->memview);
+  } else {
+    xfree(parsed_args->samples);
+  }
+  *parsed_args = (parsed_samples_t){0};
+
+  return Qnil;
+}
+
+static VALUE
+full_body(VALUE rb_args)
+{
+  full_args *args = (full_args *)rb_args;
+
+  ruby_whisper *rw;
+  ruby_whisper_params *rwp;
+  GetContext(*args->context, rw);
+  TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+
+  prepare_transcription(rwp, args->context);
+  int result = whisper_full(rw->context, rwp->params, args->samples, args->n_samples);
+
+  return INT2NUM(result);
+}
+
 /*
  * Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
  * Not thread safe for same context

@@ -289,65 +459,17 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
     rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
   }

-  ruby_whisper *rw;
-  ruby_whisper_params *rwp;
-  GetContext(self, rw);
-  VALUE params = argv[0];
-  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
-  VALUE samples = argv[1];
-  int n_samples;
-  rb_memory_view_t view;
-  const bool memory_view_available_p = rb_memory_view_available_p(samples);
-  if (argc == 3) {
-    n_samples = NUM2INT(argv[2]);
-    if (TYPE(samples) == T_ARRAY) {
-      if (RARRAY_LEN(samples) < n_samples) {
-        rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
-      }
-    }
-    // Should check when samples.respond_to?(:length)?
-  } else {
-    if (TYPE(samples) == T_ARRAY) {
-      if (RARRAY_LEN(samples) > INT_MAX) {
-        rb_raise(rb_eArgError, "samples are too long");
-      }
-      n_samples = (int)RARRAY_LEN(samples);
-    } else if (memory_view_available_p) {
-      if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
-        view.obj = Qnil;
-        rb_raise(rb_eArgError, "unable to get a memory view");
-      }
-      ssize_t n_samples_size = view.byte_size / view.item_size;
-      if (n_samples_size > INT_MAX) {
-        rb_raise(rb_eArgError, "samples are too long");
-      }
-      n_samples = (int)n_samples_size;
-    } else if (rb_respond_to(samples, id_length)) {
-      n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
-    } else {
-      rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
-    }
-  }
-  float * c_samples = (float *)malloc(n_samples * sizeof(float));
-  if (memory_view_available_p) {
-    c_samples = (float *)view.data;
-  } else {
-    if (TYPE(samples) == T_ARRAY) {
-      for (int i = 0; i < n_samples; i++) {
-        c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
-      }
-    } else {
-      // TODO: use rb_block_call
-      VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
-      for (int i = 0; i < n_samples; i++) {
-        // TODO: check if iter is exhausted and raise ArgumentError appropriately
-        VALUE sample = rb_funcall(iter, id_next, 0);
-        c_samples[i] = RFLOAT_VALUE(sample);
-      }
-    }
-  }
-  prepare_transcription(rwp, &self);
-  const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
+  VALUE n_samples = argc == 2 ? Qnil : argv[2];
+
+  struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples);
+  full_args args = {
+    &self,
+    &argv[0],
+    parsed.samples,
+    parsed.n_samples,
+  };
+  VALUE rb_result = rb_ensure(full_body, (VALUE)&args, release_samples, (VALUE)&parsed);
+  const int result = NUM2INT(rb_result);
   if (0 == result) {
     return self;
   } else {

@@ -355,6 +477,22 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
   }
 }

+static VALUE
+full_parallel_body(VALUE rb_args)
+{
+  full_parallel_args *args = (full_parallel_args *)rb_args;
+
+  ruby_whisper *rw;
+  ruby_whisper_params *rwp;
+  GetContext(*args->context, rw);
+  TypedData_Get_Struct(*args->params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+
+  prepare_transcription(rwp, args->context);
+  int result = whisper_full_parallel(rw->context, rwp->params, args->samples, args->n_samples, args->n_processors);
+
+  return INT2NUM(result);
+}
+
 /*
  * Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
  * Result is stored in the default state of the context

@@ -372,19 +510,11 @@ static VALUE
 ruby_whisper_full_parallel(int argc, VALUE *argv, VALUE self)
 {
   if (argc < 2 || argc > 4) {
-    rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
+    rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..4)", argc);
   }

-  ruby_whisper *rw;
-  ruby_whisper_params *rwp;
-  GetContext(self, rw);
-  VALUE params = argv[0];
-  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
-  VALUE samples = argv[1];
-  int n_samples;
+  VALUE n_samples = argc == 2 ? Qnil : argv[2];
   int n_processors;
-  rb_memory_view_t view;
-  const bool memory_view_available_p = rb_memory_view_available_p(samples);
   switch (argc) {
   case 2:
     n_processors = 1;

@@ -396,56 +526,16 @@ ruby_whisper_full_parallel(int argc, VALUE *argv, VALUE self)
     n_processors = NUM2INT(argv[3]);
     break;
   }
-  if (argc >= 3 && !NIL_P(argv[2])) {
-    n_samples = NUM2INT(argv[2]);
-    if (TYPE(samples) == T_ARRAY) {
-      if (RARRAY_LEN(samples) < n_samples) {
-        rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
-      }
-    }
-    // Should check when samples.respond_to?(:length)?
-  } else if (memory_view_available_p) {
-    if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
-      view.obj = Qnil;
-      rb_raise(rb_eArgError, "unable to get a memory view");
-    }
-    ssize_t n_samples_size = view.byte_size / view.item_size;
-    if (n_samples_size > INT_MAX) {
-      rb_raise(rb_eArgError, "samples are too long");
-    }
-    n_samples = (int)n_samples_size;
-  } else {
-    if (TYPE(samples) == T_ARRAY) {
-      if (RARRAY_LEN(samples) > INT_MAX) {
-        rb_raise(rb_eArgError, "samples are too long");
-      }
-      n_samples = (int)RARRAY_LEN(samples);
-    } else if (rb_respond_to(samples, id_length)) {
-      n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
-    } else {
-      rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
-    }
-  }
-  float * c_samples = (float *)malloc(n_samples * sizeof(float));
-  if (memory_view_available_p) {
-    c_samples = (float *)view.data;
-  } else {
-    if (TYPE(samples) == T_ARRAY) {
-      for (int i = 0; i < n_samples; i++) {
-        c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
-      }
-    } else {
-      // FIXME: use rb_block_call
-      VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
-      for (int i = 0; i < n_samples; i++) {
-        // TODO: check if iter is exhausted and raise ArgumentError
-        VALUE sample = rb_funcall(iter, id_next, 0);
-        c_samples[i] = RFLOAT_VALUE(sample);
-      }
-    }
-  }
-  prepare_transcription(rwp, &self);
-  const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
+  struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples);
+  const full_parallel_args args = {
+    &self,
+    &argv[0],
+    parsed.samples,
+    parsed.n_samples,
+    n_processors,
+  };
+  const VALUE rb_result = rb_ensure(full_parallel_body, (VALUE)&args, release_samples, (VALUE)&parsed);
+  const int result = NUM2INT(rb_result);
   if (0 == result) {
     return self;
   } else {

@@ -631,7 +721,7 @@ ruby_whisper_get_model(VALUE self)
   return rb_whisper_model_s_new(self);
 }

-void
+VALUE
 init_ruby_whisper_context(VALUE *mWhisper)
 {
   cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);

@@ -669,4 +759,6 @@ init_ruby_whisper_context(VALUE *mWhisper)
   rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);

   rb_define_method(cContext, "model", ruby_whisper_get_model, 0);
+
+  return cContext;
 }
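Both `ruby_whisper_full` and `ruby_whisper_full_parallel` now route cleanup through `rb_ensure`, and `parse_samples` guards its fill loop with `rb_protect`, so the sample buffer (or MemoryView) is released even if a Ruby exception is raised mid-transcription. A minimal standalone sketch of that idiom (function names are illustrative):

```c
#include <ruby.h>

static VALUE
body(VALUE arg)            /* may raise a Ruby exception */
{
    /* ... work with the resource passed in arg ... */
    return Qnil;
}

static VALUE
cleanup(VALUE arg)         /* always runs, raise or not */
{
    xfree((void *)arg);
    return Qnil;
}

static VALUE
with_buffer(void)
{
    float *buf = ALLOC_N(float, 1024);
    /* rb_ensure(body, body_arg, ensure, ensure_arg): C-level begin/ensure */
    return rb_ensure(body, (VALUE)buf, cleanup, (VALUE)buf);
}
```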
@@ -0,0 +1,163 @@
+#include "ruby_whisper.h"
+
+#define NUM_PARAMS 6
+
+#define DEF_BOOLEAN_ATTR_METHOD(name) \
+  static VALUE \
+  ruby_whisper_context_params_get_ ## name(VALUE self) { \
+    ruby_whisper_context_params *rwcp; \
+    GetContextParams(self, rwcp); \
+    return rwcp->params.name ? Qtrue : Qfalse; \
+  } \
+  static VALUE \
+  ruby_whisper_context_params_set_ ## name(VALUE self, VALUE value) { \
+    ruby_whisper_context_params *rwcp; \
+    GetContextParams(self, rwcp); \
+    rwcp->params.name = RTEST(value); \
+    return value; \
+  }
+
+#define DEF_INT_ATTR_METHOD(name) \
+  static VALUE \
+  ruby_whisper_context_params_get_ ## name(VALUE self) { \
+    ruby_whisper_context_params *rwcp; \
+    GetContextParams(self, rwcp); \
+    return INT2NUM(rwcp->params.name); \
+  } \
+  static VALUE \
+  ruby_whisper_context_params_set_ ## name(VALUE self, VALUE value) { \
+    ruby_whisper_context_params *rwcp; \
+    GetContextParams(self, rwcp); \
+    rwcp->params.name = NUM2INT(value); \
+    return value; \
+  }
+
+#define DEFINE_PARAM(param_name, nth) \
+  id_ ## param_name = rb_intern(#param_name); \
+  param_names[nth] = id_ ## param_name; \
+  rb_define_method(cContextParams, #param_name, ruby_whisper_context_params_get_ ## param_name, 0); \
+  rb_define_method(cContextParams, #param_name "=", ruby_whisper_context_params_set_ ## param_name, 1);
+
+VALUE cContextParams;
+
+static ID param_names[NUM_PARAMS];
+static ID id_use_gpu;
+static ID id_flash_attn;
+static ID id_gpu_device;
+static ID id_dtw_token_timestamps;
+static ID id_dtw_aheads_preset;
+static ID id_dtw_n_top;
+
+static size_t
+ruby_whisper_context_params_memsize(const void *p)
+{
+  const ruby_whisper_context_params *rwcp = (ruby_whisper_context_params *)p;
+  if (!rwcp) {
+    return 0;
+  }
+  return sizeof(ruby_whisper_context_params);
+}
+
+const rb_data_type_t ruby_whisper_context_params_type = {
+  "ruby_whisper_context_params",
+  {0, RUBY_DEFAULT_FREE, ruby_whisper_context_params_memsize,},
+  0, 0,
+  0
+};
+
+static VALUE
+ruby_whisper_context_params_s_allocate(VALUE klass)
+{
+  ruby_whisper_context_params *rwcp;
+  return TypedData_Make_Struct(klass, ruby_whisper_context_params, &ruby_whisper_context_params_type, rwcp);
+}
+
+DEF_BOOLEAN_ATTR_METHOD(use_gpu);
+DEF_BOOLEAN_ATTR_METHOD(flash_attn);
+DEF_INT_ATTR_METHOD(gpu_device);
+DEF_BOOLEAN_ATTR_METHOD(dtw_token_timestamps);
+DEF_INT_ATTR_METHOD(dtw_aheads_preset);
+
+static VALUE
+ruby_whisper_context_params_get_dtw_n_top(VALUE self) {
+  ruby_whisper_context_params *rwcp;
+  GetContextParams(self, rwcp);
+
+  int dtw_n_top = rwcp->params.dtw_n_top;
+
+  return dtw_n_top == -1 ? Qnil : INT2NUM(dtw_n_top);
+}
+
+static VALUE
+ruby_whisper_context_params_set_dtw_n_top(VALUE self, VALUE value) {
+  ruby_whisper_context_params *rwcp;
+  GetContextParams(self, rwcp);
+
+  rwcp->params.dtw_n_top = NIL_P(value) ? -1 : NUM2INT(value);
+
+  return value;
+}
+
+#define SET_PARAM_IF_SAME(param_name) \
+  if (id == id_ ## param_name) { \
+    ruby_whisper_context_params_set_ ## param_name(self, value); \
+    continue; \
+  }
+
+static VALUE
+ruby_whisper_context_params_initialize(int argc, VALUE *argv, VALUE self)
+{
+  ruby_whisper_context_params *rwcp;
+  TypedData_Get_Struct(self, ruby_whisper_context_params, &ruby_whisper_context_params_type, rwcp);
+  rwcp->params = whisper_context_default_params();
+
+  VALUE kw_hash;
+  rb_scan_args_kw(RB_SCAN_ARGS_KEYWORDS, argc, argv, ":", &kw_hash);
+  if (NIL_P(kw_hash)) {
+    return Qnil;
+  }
+
+  VALUE values[NUM_PARAMS] = {Qundef};
+  rb_get_kwargs(kw_hash, param_names, 0, NUM_PARAMS, values);
+
+  ID id;
+  VALUE value;
+  for (int i = 0; i < NUM_PARAMS; i++) {
+    id = param_names[i];
+    value = values[i];
+    if (value == Qundef) {
+      continue;
+    }
+    SET_PARAM_IF_SAME(use_gpu)
+    SET_PARAM_IF_SAME(flash_attn)
+    SET_PARAM_IF_SAME(gpu_device)
+    SET_PARAM_IF_SAME(dtw_token_timestamps)
+    SET_PARAM_IF_SAME(dtw_aheads_preset)
+    SET_PARAM_IF_SAME(dtw_n_top)
+  }
+
+  return Qnil;
+}
+
+#undef SET_PARAM_IF_SAME
+
+void
+init_ruby_whisper_context_params(VALUE *cContext)
+{
+  cContextParams = rb_define_class_under(*cContext, "Params", rb_cObject);
+
+  rb_define_alloc_func(cContextParams, ruby_whisper_context_params_s_allocate);
+  rb_define_method(cContextParams, "initialize", ruby_whisper_context_params_initialize, -1);
+
+  DEFINE_PARAM(use_gpu, 0)
+  DEFINE_PARAM(flash_attn, 1)
+  DEFINE_PARAM(gpu_device, 2)
+  DEFINE_PARAM(dtw_token_timestamps, 3)
+  DEFINE_PARAM(dtw_aheads_preset, 4)
+  DEFINE_PARAM(dtw_n_top, 5)
+}
+
+#undef DEFINE_PARAM
+#undef DEF_INT_ATTR_METHOD
+#undef DEF_BOOLEAN_ATTR_METHOD
+#undef NUM_PARAMS
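The initializer above leans on `rb_scan_args_kw` and `rb_get_kwargs`: unknown keywords raise `ArgumentError`, while slots for keywords the caller omitted stay `Qundef`, which is why the loop skips them. A trimmed sketch of just that mechanism (keyword names are illustrative):

```c
#include <ruby.h>

static VALUE
demo_initialize(int argc, VALUE *argv, VALUE self)
{
    VALUE kw_hash, values[2] = {Qundef, Qundef};
    ID keys[2];
    keys[0] = rb_intern("alpha");
    keys[1] = rb_intern("beta");

    /* ":" = keyword arguments only; kw_hash is Qnil when none were given */
    rb_scan_args_kw(RB_SCAN_ARGS_KEYWORDS, argc, argv, ":", &kw_hash);
    if (!NIL_P(kw_hash)) {
        /* 0 required, 2 optional; unknown keys raise ArgumentError */
        rb_get_kwargs(kw_hash, keys, 0, 2, values);
    }
    if (values[0] != Qundef) { /* :alpha was passed */ }
    if (values[1] != Qundef) { /* :beta was passed  */ }
    return Qnil;
}
```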
@@ -1,4 +1,3 @@
-#include <ruby.h>
 #include "ruby_whisper.h"

 extern const rb_data_type_t ruby_whisper_type;

@@ -1,4 +1,3 @@
-#include <ruby.h>
 #include "ruby_whisper.h"

 #define BOOL_PARAMS_SETTER(self, prop, value) \

@@ -1,4 +1,3 @@
-#include <ruby.h>
 #include "ruby_whisper.h"

 #define N_KEY_NAMES 6
@ -1,4 +1,3 @@
|
|||
#include <ruby.h>
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
#define N_KEY_NAMES 11
|
||||
|
|
@ -25,12 +24,34 @@ ruby_whisper_token_memsize(const void *p)
    if (!rwt) {
        return 0;
    }
    return sizeof(rwt);
    size_t size = sizeof(*rwt);
    if (rwt->token_data) {
        size += sizeof(*rwt->token_data);
    }
    return size;
}

static void
ruby_whisper_token_mark(void *p)
{
    ruby_whisper_token *rwt = (ruby_whisper_token *)p;
    rb_gc_mark(rwt->text);
}

static void
ruby_whisper_token_free(void *p)
{
    ruby_whisper_token *rwt = (ruby_whisper_token *)p;
    if (rwt->token_data) {
        xfree(rwt->token_data);
        rwt->token_data = NULL;
    }
    xfree(rwt);
}

static const rb_data_type_t ruby_whisper_token_type = {
    "ruby_whisper_token",
    {0, RUBY_DEFAULT_FREE, ruby_whisper_token_memsize,},
    {ruby_whisper_token_mark, ruby_whisper_token_free, ruby_whisper_token_memsize,},
    0, 0,
    0
};

@ -41,19 +62,19 @@ ruby_whisper_token_allocate(VALUE klass)
    ruby_whisper_token *rwt;
    VALUE token = TypedData_Make_Struct(klass, ruby_whisper_token, &ruby_whisper_token_type, rwt);
    rwt->token_data = NULL;
    rwt->text = NULL;
    rwt->text = Qnil;
    return token;
}

VALUE
ruby_whisper_token_s_init(struct whisper_context *context, int i_segment, int i_token)
{
    whisper_token_data token_data = whisper_full_get_token_data(context, i_segment, i_token);
    const VALUE token = ruby_whisper_token_allocate(cToken);
    ruby_whisper_token *rwt;
    TypedData_Get_Struct(token, ruby_whisper_token, &ruby_whisper_token_type, rwt);
    rwt->token_data = &token_data;
    rwt->text = whisper_full_get_token_text(context, i_segment, i_token);
    rwt->token_data = ALLOC(whisper_token_data);
    *(rwt->token_data) = whisper_full_get_token_data(context, i_segment, i_token);
    rwt->text = rb_str_new2(whisper_full_get_token_text(context, i_segment, i_token));
    return token;
}

@ -183,10 +204,9 @@ ruby_whisper_token_get_text(VALUE self)
{
    ruby_whisper_token *rwt;
    GetToken(self, rwt);
    return rb_str_new2(rwt->text);
    return rwt->text;
}

/*
 * Start time of the token.
 *
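The rewrite above fixes a lifetime bug: token_data previously pointed at a stack-local struct and text at a C string owned by whisper.cpp, so both could dangle. The token now holds an ALLOC'd copy that ruby_whisper_token_free releases and a Ruby String that ruby_whisper_token_mark keeps alive. A sketch of the access pattern this makes safe, with method names as used in the tests below:

  whisper.each_segment.first.each_token do |token|
    GC.start  # token survives GC: it owns a heap copy plus a marked String
    puts "#{token.text}: #{token.start_time}..#{token.end_time}"
  end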
@ -1,4 +1,3 @@
#include <ruby.h>
#include "ruby_whisper.h"
#include "common-whisper.h"
#include <string>

@ -13,6 +12,7 @@ extern const rb_data_type_t ruby_whisper_params_type;

extern ID id_to_s;
extern ID id_call;
extern ID id_to_path;
extern ID transcribe_option_names[1];

extern void

@ -50,6 +50,9 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
        rb_raise(rb_eRuntimeError, "Expected file path to wave file");
    }

    if (rb_respond_to(wave_file_path, id_to_path)) {
        wave_file_path = rb_funcall(wave_file_path, id_to_path, 0);
    }
    std::string fname_inp = StringValueCStr(wave_file_path);

    std::vector<float> pcmf32; // mono-channel F32 PCM
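With the argument now funneled through #to_path when the receiver responds to it, any path-like object works; a sketch (the file name is a placeholder, mirroring test_whisper_pathname later in this diff):

  whisper.transcribe(Pathname("speech.wav"), Whisper::Params.new) { |text| puts text }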
@ -1,12 +1,23 @@
#include <ruby.h>
#include "ruby_whisper.h"

extern ID id_to_s;

extern VALUE cVADContext;

extern const rb_data_type_t ruby_whisper_vad_params_type;
extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params);
extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
extern parsed_samples_t parse_samples(VALUE *samples, VALUE *n_samples);
extern VALUE release_samples(VALUE parsed);

extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments);

typedef struct segments_from_samples_args {
    VALUE *context;
    VALUE *params;
    float *samples;
    int n_samples;
} segments_from_samples_args;

static size_t
ruby_whisper_vad_context_memsize(const void *p)

@ -66,10 +77,46 @@ ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path)
    return Qnil;
}

static VALUE
segments_from_samples_body(VALUE rb_args)
{
    segments_from_samples_args *args = (segments_from_samples_args *)rb_args;

    ruby_whisper_vad_context *rwvc;
    ruby_whisper_vad_params *rwvp;
    GetVADContext(*args->context, rwvc);
    GetVADParams(*args->params, rwvp);

    struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(rwvc->context, rwvp->params, args->samples, args->n_samples);

    return ruby_whisper_vad_segments_s_init(segments);
}

static VALUE
ruby_whisper_vad_segments_from_samples(int argc, VALUE *argv, VALUE self)
{
    if (argc < 2 || argc > 3) {
        rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
    }

    VALUE n_samples = argc == 2 ? Qnil : argv[2];
    struct parsed_samples_t parsed = parse_samples(&argv[1], &n_samples);
    segments_from_samples_args args = {
        &self,
        &argv[0],
        parsed.samples,
        parsed.n_samples,
    };
    VALUE segments = rb_ensure(segments_from_samples_body, (VALUE)&args, release_samples, (VALUE)&parsed);

    return segments;
}

void init_ruby_whisper_vad_context(VALUE *mVAD)
{
    cVADContext = rb_define_class_under(*mVAD, "Context", rb_cObject);
    rb_define_alloc_func(cVADContext, ruby_whisper_vad_context_s_allocate);
    rb_define_method(cVADContext, "initialize", ruby_whisper_vad_context_initialize, 1);
    rb_define_method(cVADContext, "segments_from_samples", ruby_whisper_vad_segments_from_samples, -1);
    rb_define_method(cVADContext, "detect", ruby_whisper_vad_detect, 2);
}
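A usage sketch of the new binding; the model name and the 78-byte WAV-header skip mirror the tests later in this diff, and the file name is a placeholder:

  vad      = Whisper::VAD::Context.new("silero-v6.2.0")
  samples  = File.read("speech.wav", nil, 78).unpack("s<*").collect { |i| i.to_f / 2**15 }
  segments = vad.segments_from_samples(Whisper::VAD::Params.new, samples)

Wrapping the call in rb_ensure guarantees release_samples runs even if whisper_vad_segments_from_samples or the argument handling raises.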
@ -1,4 +1,3 @@
#include <ruby.h>
#include "ruby_whisper.h"
#include "common-whisper.h"
#include <string>

@ -8,6 +7,8 @@
extern "C" {
#endif

extern ID id_to_path;

extern VALUE cVADSegments;

extern const rb_data_type_t ruby_whisper_vad_context_type;

@ -25,12 +26,12 @@ ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params) {
    std::vector<std::vector<float>> pcmf32s;
    whisper_vad_segments *segments;

    TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc);
    if (rwvc->context == NULL) {
        rb_raise(rb_eRuntimeError, "Doesn't have referenxe to context internally");
    }
    GetVADContext(self, rwvc);
    TypedData_Get_Struct(params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);

    if (rb_respond_to(file_path, id_to_path)) {
        file_path = rb_funcall(file_path, id_to_path, 0);
    }
    cpp_file_path = StringValueCStr(file_path);

    if (!read_audio_data(cpp_file_path, pcmf32, pcmf32s, false)) {
@ -1,4 +1,3 @@
#include <ruby.h>
#include "ruby_whisper.h"

#define DEFINE_PARAM(param_name, nth) \

@ -1,4 +1,3 @@
#include <ruby.h>
#include "ruby_whisper.h"

#define N_KEY_NAMES 2

@ -1,4 +1,3 @@
#include <ruby.h>
#include "ruby_whisper.h"

extern ID id___method__;
@ -17,6 +17,21 @@ module Whisper
  LOG_LEVEL_ERROR: Integer
  LOG_LEVEL_DEBUG: Integer
  LOG_LEVEL_CONT: Integer
  AHEADS_NONE: Integer
  AHEADS_N_TOP_MOST: Integer
  AHEADS_CUSTOM: Integer
  AHEADS_TINY_EN: Integer
  AHEADS_TINY: Integer
  AHEADS_BASE_EN: Integer
  AHEADS_BASE: Integer
  AHEADS_SMALL_EN: Integer
  AHEADS_SMALL: Integer
  AHEADS_MEDIUM_EN: Integer
  AHEADS_MEDIUM: Integer
  AHEADS_LARGE_V1: Integer
  AHEADS_LARGE_V2: Integer
  AHEADS_LARGE_V3: Integer
  AHEADS_LARGE_V3_TURBO: Integer

  def self.lang_max_id: () -> Integer
  def self.lang_id: (string name) -> Integer

@ -37,8 +52,8 @@ module Whisper
    #   puts text
    # end
    #
    def transcribe: (string, Params, ?n_processors: Integer) -> self
                  | (string, Params, ?n_processors: Integer) { (String) -> void } -> self
    def transcribe: (path, Params, ?n_processors: Integer) -> self
                  | (path, Params, ?n_processors: Integer) { (String) -> void } -> self

    def model_n_vocab: () -> Integer
    def model_n_audio_ctx: () -> Integer

@ -120,6 +135,30 @@ module Whisper

    def to_srt: () -> String
    def to_webvtt: () -> String

    class Params
      def self.new: (
        use_gpu: boolish,
        flash_attn: boolish,
        gpu_device: Integer,
        dtw_token_timestamps: boolish,
        dtw_aheads_preset: Integer,
        dtw_n_top: Integer | nil,
      ) -> instance

      def use_gpu=: (boolish) -> boolish
      def use_gpu: () -> (true | false)
      def flash_attn=: (boolish) -> boolish
      def flash_attn: () -> (true | false)
      def gpu_device=: (Integer) -> Integer
      def gpu_device: () -> Integer
      def dtw_token_timestamps=: (boolish) -> boolish
      def dtw_token_timestamps: () -> (true | false)
      def dtw_aheads_preset=: (Integer) -> Integer
      def dtw_aheads_preset: () -> Integer
      def dtw_n_top=: (Integer | nil) -> (Integer | nil)
      def dtw_n_top: () -> (Integer | nil)
    end
  end

  class Params

@ -603,6 +642,8 @@ module Whisper

    class Context
      def self.new: (String | path | ::URI::HTTP model_name_or_path) -> instance
      def segments_from_samples: (Params, Array[Float] samples, ?Integer n_samples) -> Segments
                               | (Params, _Samples, ?Integer n_samples) -> Segments
      def detect: (path wav_file_path, Params) -> Segments
    end
@ -0,0 +1,82 @@
require_relative "helper"

class TestContextParams < TestBase
  PARAM_NAMES = [
    :use_gpu,
    :flash_attn,
    :gpu_device,
    :dtw_token_timestamps,
    :dtw_aheads_preset,
    :dtw_n_top
  ]

  def test_new
    params = Whisper::Context::Params.new
    assert_instance_of Whisper::Context::Params, params
  end

  def test_attributes
    params = Whisper::Context::Params.new

    assert_true params.use_gpu
    params.use_gpu = false
    assert_false params.use_gpu

    assert_true params.flash_attn
    params.flash_attn = false
    assert_false params.flash_attn

    assert_equal 0, params.gpu_device
    params.gpu_device = 1
    assert_equal 1, params.gpu_device

    assert_false params.dtw_token_timestamps
    params.dtw_token_timestamps = true
    assert_true params.dtw_token_timestamps

    assert_equal Whisper::AHEADS_NONE, params.dtw_aheads_preset
    params.dtw_aheads_preset = Whisper::AHEADS_BASE
    assert_equal Whisper::AHEADS_BASE, params.dtw_aheads_preset

    assert_nil params.dtw_n_top
    params.dtw_n_top = 6
    assert_equal 6, params.dtw_n_top
    params.dtw_n_top = nil
    assert_nil params.dtw_n_top
  end

  def test_new_with_kw_args
    params = Whisper::Context::Params.new(use_gpu: false)
    assert_false params.use_gpu
  end

  def test_new_with_kw_args_non_existent
    assert_raise ArgumentError do
      Whisper::Context::Params.new(non_existent: "value")
    end
  end

  data(PARAM_NAMES.collect {|param| [param, param]}.to_h)
  def test_new_with_kw_args_default_values(param)
    default_params = Whisper::Context::Params.new
    default_value = default_params.send(param)
    value = if param == :dtw_n_top
              6
            else
              case default_value
              in true | false
                !default_value
              in Integer
                default_value + 1
              end
            end
    params = Whisper::Context::Params.new(param => value)
    assert_equal value, params.send(param)

    PARAM_NAMES.reject {|name| name == param}.each do |name|
      expected = default_params.send(name)
      actual = params.send(name)
      assert_equal expected, actual
    end
  end
end
@ -56,6 +56,17 @@ class TestToken < TestBase
    @segment.each_token.collect(&:text)
  end

  def test_token_timestamps
    params = Whisper::Params.new(token_timestamps: true)
    whisper.transcribe(TestBase::AUDIO, params)
    prev = -1
    whisper.each_segment.first.each_token do |token|
      assert token.start_time >= prev
      assert token.end_time >= token.start_time
      prev = token.end_time
    end
  end

  def test_deconstruct_keys_with_nil
    keys = %i[id tid probability log_probability pt ptsum t_dtw voice_length start_time end_time text]
    expected = keys.collect {|key| [key, @token.send(key)] }.to_h
@ -9,6 +9,25 @@ class TestVADContext < TestBase
  def test_detect
    context = Whisper::VAD::Context.new("silero-v6.2.0")
    segments = context.detect(AUDIO, Whisper::VAD::Params.new)
    assert_segments segments
  end

  def test_invalid_model_type
    assert_raise TypeError do
      Whisper::VAD::Context.new(Object.new)
    end
  end

  def test_allocate
    vad = Whisper::VAD::Context.allocate
    assert_raise do
      vad.detect(AUDIO, Whisper::VAD::Params.new)
    end
  end

  private

  def assert_segments(segments)
    assert_instance_of Whisper::VAD::Segments, segments

    i = 0

@ -35,16 +54,47 @@ class TestVADContext < TestBase
    assert_equal 4, segments.length
  end

  def test_invalid_model_type
    assert_raise TypeError do
      Whisper::VAD::Context.new(Object.new)
  sub_test_case "from samples" do
    def setup
      super
      @vad = Whisper::VAD::Context.new("silero-v6.2.0")
      @samples = File.read(AUDIO, nil, 78).unpack("s<*").collect {|i| i.to_f / 2**15}
    end
  end

  def test_allocate
    vad = Whisper::VAD::Context.allocate
    assert_raise do
      vad.detect(AUDIO, Whisper::VAD::Params.new)
    def test_segments_from_samples
      segments = @vad.segments_from_samples(Whisper::VAD::Params.new, @samples, @samples.length)
      assert_segments segments
    end

    def test_segments_from_samples_without_length
      segments = @vad.segments_from_samples(Whisper::VAD::Params.new, @samples)
      assert_segments segments
    end

    def test_segments_from_samples_enumerator
      samples = @samples.each
      segments = @vad.segments_from_samples(Whisper::VAD::Params.new, samples, @samples.length)
      assert_segments segments
    end

    def test_segments_from_samples_enumerator_without_length
      samples = @samples.each
      assert_raise ArgumentError do
        @vad.segments_from_samples(Whisper::VAD::Params.new, samples)
      end
    end

    def test_segments_from_samples_enumerator_with_too_large_length
      samples = @samples.each.take(10).to_enum
      assert_raise StopIteration do
        @vad.segments_from_samples(Whisper::VAD::Params.new, samples, 11)
      end
    end

    def test_segments_from_samples_with_memory_view
      samples = JFKReader.new(AUDIO)
      segments = @vad.segments_from_samples(Whisper::VAD::Params.new, samples)
      assert_segments segments
    end
  end
end
@ -1,6 +1,7 @@
require_relative "helper"
require "stringio"
require "etc"
require "pathname"

# Exists to detect memory-related bug
Whisper.log_set ->(level, buffer, user_data) {}, nil

@ -20,6 +21,15 @@ class TestWhisper < TestBase
    }
  end

  def test_whisper_pathname
    @whisper = Whisper::Context.new("base.en")
    params = Whisper::Params.new

    @whisper.transcribe(Pathname(AUDIO), params) {|text|
      assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
    }
  end

  def test_transcribe_non_parallel
    @whisper = Whisper::Context.new("base.en")
    params = Whisper::Params.new

@ -207,6 +217,16 @@ class TestWhisper < TestBase
    assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
  end

  def test_full_with_memory_view_gc
    samples = JFKReader.new(AUDIO)
    @whisper.full(@params, samples)
    GC.start
    require "fiddle"
    Fiddle::MemoryView.export samples do |view|
      assert_equal 176000, view.to_s.unpack("#{view.format}*").length
    end
  end

  def test_full_parallel
    nprocessors = 2
    @whisper.full_parallel(@params, @samples, @samples.length, nprocessors)
@ -3,7 +3,7 @@ require_relative "extsources"
Gem::Specification.new do |s|
  s.name = "whispercpp"
  s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
  s.version = '1.3.5'
  s.version = '1.3.6'
  s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
  s.email = 'todd.fisher@gmail.com'
  s.extra_rdoc_files = ['LICENSE', 'README.md']
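To pick up these changes as released, a Gemfile entry would pin the bumped version (a sketch; assumes the default RubyGems source):

  gem "whispercpp", "~> 1.3.6"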
@ -559,7 +559,7 @@ xcodebuild -create-xcframework \
    -framework $(pwd)/build-ios-device/framework/whisper.framework \
    -debug-symbols $(pwd)/build-ios-device/dSYMs/whisper.dSYM \
    -framework $(pwd)/build-macos/framework/whisper.framework \
    -debug-symbols $(pwd)/build-macos/dSYMS/whisper.dSYM \
    -debug-symbols $(pwd)/build-macos/dSYMs/whisper.dSYM \
    -framework $(pwd)/build-visionos/framework/whisper.framework \
    -debug-symbols $(pwd)/build-visionos/dSYMs/whisper.dSYM \
    -framework $(pwd)/build-visionos-sim/framework/whisper.framework \
@ -15,7 +15,7 @@ jobs:
      issues: write
      pull-requests: write
    steps:
      - uses: actions/stale@v5
      - uses: actions/stale@v10
        with:
          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
          days-before-issue-stale: 30
@ -3,60 +3,25 @@ set(WHISPER_BUILD_COMMIT @WHISPER_BUILD_COMMIT@)
set(WHISPER_BUILD_NUMBER @WHISPER_BUILD_NUMBER@)
set(WHISPER_SHARED_LIB @BUILD_SHARED_LIBS@)

set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@)
set(GGML_METAL @GGML_METAL@)
set(GGML_HIPBLAS @GGML_HIPBLAS@)
set(GGML_ACCELERATE @GGML_ACCELERATE@)

@PACKAGE_INIT@

set_and_check(WHISPER_INCLUDE_DIR "@PACKAGE_WHISPER_INCLUDE_INSTALL_DIR@")
set_and_check(WHISPER_LIB_DIR "@PACKAGE_WHISPER_LIB_INSTALL_DIR@")
set_and_check(WHISPER_BIN_DIR "@PACKAGE_WHISPER_BIN_INSTALL_DIR@")

# Ensure transient dependencies satisfied

find_package(Threads REQUIRED)

if (APPLE AND GGML_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
endif()

if (GGML_BLAS)
    find_package(BLAS REQUIRED)
endif()

if (GGML_CUDA)
    find_package(CUDAToolkit REQUIRED)
endif()

if (GGML_METAL)
    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
    find_library(METAL_FRAMEWORK Metal REQUIRED)
    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()

if (GGML_HIPBLAS)
    find_package(hip REQUIRED)
    find_package(hipblas REQUIRED)
    find_package(rocblas REQUIRED)
endif()
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

find_library(whisper_LIBRARY whisper
    REQUIRED
    HINTS ${WHISPER_LIB_DIR})

set(_whisper_link_deps "Threads::Threads" "@WHISPER_EXTRA_LIBS@")
set(_whisper_transient_defines "@WHISPER_TRANSIENT_DEFINES@")
    HINTS ${WHISPER_LIB_DIR}
    NO_CMAKE_FIND_ROOT_PATH
)

add_library(whisper UNKNOWN IMPORTED)

set_target_properties(whisper
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${WHISPER_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "${_whisper_link_deps}"
        INTERFACE_COMPILE_DEFINITIONS "${_whisper_transient_defines}"
        INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${whisper_LIBRARY}"
        INTERFACE_COMPILE_FEATURES cxx_std_11
@ -77,6 +77,7 @@ struct whisper_params {
    bool log_score = false;
    bool use_gpu = true;
    bool flash_attn = true;
    int32_t gpu_device = 0;
    bool suppress_nst = false;
    bool carry_initial_prompt = false;

@ -129,6 +130,10 @@ static char * requires_value_error(const std::string & arg) {
}

static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    if (const char * env_device = std::getenv("WHISPER_ARG_DEVICE")) {
        params.gpu_device = std::stoi(env_device);
    }

    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
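Since the environment variable is read before the flag loop, an explicit -dev/--device argument still overrides it. A hedged sketch of driving the CLI from Ruby (binary, model, and audio paths are placeholders):

  system({ "WHISPER_ARG_DEVICE" => "1" },
         "./build/bin/whisper-cli", "-m", "models/ggml-base.en.bin", "-f", "speech.wav")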
@ -195,6 +200,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
        else if (arg == "-dtw" || arg == "--dtw") { params.dtw = ARGV_NEXT; }
        else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
        else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
        else if (arg == "-dev" || arg == "--device") { params.gpu_device = std::stoi(ARGV_NEXT); }
        else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
        else if (arg == "-nfa" || arg == "--no-flash-attn") { params.flash_attn = false; }
        else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; }

@ -276,6 +282,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
    fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
    fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
    fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
    fprintf(stderr, " -dev N, --device N [%-7d] GPU device ID (default: 0)\n", params.gpu_device);
    fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
    fprintf(stderr, " -nfa, --no-flash-attn [%-7s] disable flash attention\n", params.flash_attn ? "false" : "true");
    fprintf(stderr, " -sns, --suppress-nst [%-7s] suppress non-speech tokens\n", params.suppress_nst ? "true" : "false");

@ -1003,6 +1010,7 @@ int main(int argc, char ** argv) {
    struct whisper_context_params cparams = whisper_context_default_params();

    cparams.use_gpu = params.use_gpu;
    cparams.gpu_device = params.gpu_device;
    cparams.flash_attn = params.flash_attn;

    if (!params.dtw.empty()) {
@ -73,6 +73,7 @@ bool ggml_common_quantize_0(
        case GGML_FTYPE_MOSTLY_IQ1_M:
        case GGML_FTYPE_MOSTLY_BF16:
        case GGML_FTYPE_MOSTLY_MXFP4:
        case GGML_FTYPE_MOSTLY_NVFP4:
            {
                fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                return false;

@ -213,6 +214,7 @@ bool ggml_common_quantize_0(
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_NVFP4:
        case GGML_TYPE_COUNT:
            {
                fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
examples/miniaudio.h: 6910 lines changed (file diff suppressed because it is too large)
@ -103,6 +103,7 @@ struct whisper_params {
    bool no_timestamps = false;
    bool use_gpu = true;
    bool flash_attn = true;
    int32_t gpu_device = 0;
    bool suppress_nst = false;
    bool no_context = true;
    bool no_language_probabilities = false;

@ -179,6 +180,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, " -sns, --suppress-nst [%-7s] suppress non-speech tokens\n", params.suppress_nst ? "true" : "false");
    fprintf(stderr, " -nth N, --no-speech-thold N [%-7.2f] no speech threshold\n", params.no_speech_thold);
    fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
    fprintf(stderr, " -dev N, --device N [%-7d] GPU device ID (default: 0)\n", params.gpu_device);
    fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
    fprintf(stderr, " -nfa, --no-flash-attn [%-7s] disable flash attention\n", params.flash_attn ? "false" : "true");
    fprintf(stderr, " -nlp, --no-language-probabilities [%-7s] exclude language probabilities from verbose_json output\n", params.no_language_probabilities ? "true" : "false");

@ -198,6 +200,10 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
}

bool whisper_params_parse(int argc, char ** argv, whisper_params & params, server_params & sparams) {
    if (const char * env_device = std::getenv("WHISPER_ARG_DEVICE")) {
        params.gpu_device = std::stoi(env_device);
    }

    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
@ -237,6 +243,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
        else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
        else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
        else if (arg == "-dev" || arg == "--device") { params.gpu_device = std::stoi(argv[++i]); }
        else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
        else if (arg == "-nfa" || arg == "--no-flash-attn") { params.flash_attn = false; }
        else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; }

@ -643,6 +650,7 @@ int main(int argc, char ** argv) {
    struct whisper_context_params cparams = whisper_context_default_params();

    cparams.use_gpu = params.use_gpu;
    cparams.gpu_device = params.gpu_device;
    cparams.flash_attn = params.flash_attn;

    if (!params.dtw.empty()) {

@ -740,9 +748,9 @@ int main(int argc, char ** argv) {
    <body>
    <h1>Whisper.cpp Server</h1>

    <h2>/inference</h2>
    <h2>)" + sparams.request_path + sparams.inference_path + R"(</h2>
    <pre>
    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
    curl 127.0.0.1:)" + std::to_string(sparams.port) + sparams.request_path + sparams.inference_path + R"( \
    -H "Content-Type: multipart/form-data" \
    -F file="@<file-path>" \
    -F temperature="0.0" \

@ -759,7 +767,7 @@ int main(int argc, char ** argv) {

    <div>
    <h2>Try it out</h2>
    <form action="/inference" method="POST" enctype="multipart/form-data">
    <form action=")" + sparams.request_path + sparams.inference_path + R"(" method="POST" enctype="multipart/form-data">
    <label for="file">Choose an audio file:</label>
    <input type="file" id="file" name="file" accept="audio/*" required><br>

@ -803,6 +811,7 @@ int main(int argc, char ** argv) {
        {
            fprintf(stderr, "error: no 'file' field in the request\n");
            const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
            res.status = 400;
            res.set_content(error_resp, "application/json");
            return;
        }

@ -829,6 +838,7 @@ int main(int argc, char ** argv) {
        std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
        const bool is_converted = convert_to_wav(temp_filename, error_resp);
        if (!is_converted) {
            res.status = 500;
            res.set_content(error_resp, "application/json");
            return;
        }

@ -838,6 +848,7 @@ int main(int argc, char ** argv) {
        {
            fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
            const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
            res.status = 400;
            res.set_content(error_resp, "application/json");
            std::remove(temp_filename.c_str());
            return;

@ -849,6 +860,7 @@ int main(int argc, char ** argv) {
        {
            fprintf(stderr, "error: failed to read audio data\n");
            const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
            res.status = 400;
            res.set_content(error_resp, "application/json");
            return;
        }

@ -927,7 +939,7 @@ int main(int argc, char ** argv) {
    wparams.logprob_thold = params.logprob_thold;

    wparams.no_timestamps = params.no_timestamps;
    wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
    wparams.token_timestamps = !params.no_timestamps;
    wparams.no_context = params.no_context;

    wparams.suppress_nst = params.suppress_nst;

@ -1119,6 +1131,7 @@ int main(int argc, char ** argv) {
        {
            fprintf(stderr, "error: no 'model' field in the request\n");
            const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
            res.status = 400;
            res.set_content(error_resp, "application/json");
            return;
        }

@ -1127,6 +1140,7 @@ int main(int argc, char ** argv) {
        {
            fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
            const std::string error_resp = "{\"error\":\"model not found!\"}";
            res.status = 400;
            res.set_content(error_resp, "application/json");
            return;
        }
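With the endpoint now assembled from sparams.request_path and sparams.inference_path, clients should build the URL the same way; a request sketch in Ruby (host, port, path, and file are placeholders; field names follow the HTML form above):

  require "net/http"
  uri = URI("http://127.0.0.1:8080/inference")
  req = Net::HTTP::Post.new(uri)
  req.set_form([["file", File.open("speech.wav")], ["temperature", "0.0"]], "multipart/form-data")
  res = Net::HTTP.start(uri.hostname, uri.port) { |http| http.request(req) }
  puts res.body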
@ -22,18 +22,19 @@ if (WHISPER_SDL2)
        llama-kv-cache-iswa.cpp
        llama-memory-recurrent.cpp
        llama-memory-hybrid.cpp
        llama-memory-hybrid-iswa.cpp
        llama-memory.cpp
        llama-mmap.cpp
        llama-model-loader.cpp
        llama-model-saver.cpp
        llama-model.cpp
        llama-quant.cpp
        llama-sampling.cpp
        llama-sampler.cpp
        llama-vocab.cpp
        unicode.cpp
        unicode-data.cpp
        ${SRC_MODELS})
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
    target_include_directories(${TARGET} PRIVATE . ${SDL2_INCLUDE_DIRS})

    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
    install(TARGETS ${TARGET} RUNTIME)
@ -146,11 +146,9 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    return nullptr;
}

static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    llama_model & model = adapter.model;

    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,

@ -413,17 +411,17 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
        }
    }

    // update number of nodes used
    model.n_lora_nodes += adapter.get_n_nodes();
    // register adapter with model
    model.loras.insert(&adapter);

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
    llama_adapter_lora * adapter = new llama_adapter_lora(*model);
    llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
        llama_adapter_lora_init_impl(path_lora, *adapter);
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

@ -473,12 +471,8 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    // update number of nodes used
    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
    adapter->model.n_lora_nodes -= adapter->get_n_nodes();

    delete adapter;
void llama_adapter_lora_free(llama_adapter_lora *) {
    // deprecated: adapters are freed by llama_model's destructor
}

uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
@ -39,6 +39,8 @@ private:
    std::vector<ggml_tensor *> tensors; // per layer
};

using llama_adapter_cvec_ptr = std::shared_ptr<llama_adapter_cvec>;

//
// llama_adapter_lora
//

@ -59,8 +61,6 @@ struct llama_adapter_lora_weight {
};

struct llama_adapter_lora {
    llama_model & model;

    // map tensor name to lora_a_b
    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

@ -75,7 +75,7 @@ struct llama_adapter_lora {
    // activated lora (aLoRA)
    std::vector<llama_token> alora_invocation_tokens;

    llama_adapter_lora(llama_model & model) : model(model) {}
    llama_adapter_lora() = default;
    ~llama_adapter_lora() = default;

    llama_adapter_lora_weight * get_weight(ggml_tensor * w);

@ -86,3 +86,4 @@ struct llama_adapter_lora {
};

using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
using llama_adapter_loras_ptr = std::unique_ptr<llama_adapter_loras>;
@ -4,6 +4,7 @@

#include <map>
#include <set>
#include <vector>

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize

@ -26,6 +27,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_NEO_BERT, "neo-bert" },
    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
    { LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
    { LLM_ARCH_EUROBERT, "eurobert" },
    { LLM_ARCH_BLOOM, "bloom" },
    { LLM_ARCH_STABLELM, "stablelm" },
    { LLM_ARCH_QWEN, "qwen" },

@ -37,6 +39,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_QWEN3NEXT, "qwen3next" },
    { LLM_ARCH_QWEN3VL, "qwen3vl" },
    { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
    { LLM_ARCH_QWEN35, "qwen35" },
    { LLM_ARCH_QWEN35MOE, "qwen35moe" },
    { LLM_ARCH_PHI2, "phi2" },
    { LLM_ARCH_PHI3, "phi3" },
    { LLM_ARCH_PHIMOE, "phimoe" },

@ -72,15 +76,18 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_CHATGLM, "chatglm" },
    { LLM_ARCH_GLM4, "glm4" },
    { LLM_ARCH_GLM4_MOE, "glm4moe" },
    { LLM_ARCH_GLM_DSA, "glm-dsa" },
    { LLM_ARCH_BITNET, "bitnet" },
    { LLM_ARCH_T5, "t5" },
    { LLM_ARCH_T5ENCODER, "t5encoder" },
    { LLM_ARCH_JAIS, "jais" },
    { LLM_ARCH_JAIS2, "jais2" },
    { LLM_ARCH_NEMOTRON, "nemotron" },
    { LLM_ARCH_NEMOTRON_H, "nemotron_h" },
    { LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
    { LLM_ARCH_EXAONE, "exaone" },
    { LLM_ARCH_EXAONE4, "exaone4" },
    { LLM_ARCH_EXAONE_MOE, "exaone-moe" },
    { LLM_ARCH_RWKV6, "rwkv6" },
    { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
    { LLM_ARCH_RWKV7, "rwkv7" },

@ -116,9 +123,12 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_RND1, "rnd1" },
    { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
    { LLM_ARCH_MISTRAL3, "mistral3" },
    { LLM_ARCH_MIMO2, "mimo2" },
    { LLM_ARCH_PADDLEOCR, "paddleocr" },
    { LLM_ARCH_MIMO2, "mimo2" },
    { LLM_ARCH_STEP35, "step35" },
    { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
    { LLM_ARCH_MAINCODER, "maincoder" },
    { LLM_ARCH_KIMI_LINEAR, "kimi-linear" },
    { LLM_ARCH_UNKNOWN, "(unknown)" },
};

@ -160,6 +170,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
    { LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" },
    { LLM_KV_SWIGLU_CLAMP_EXP, "%s.swiglu_clamp_exp" },
    { LLM_KV_SWIGLU_CLAMP_SHEXP, "%s.swiglu_clamp_shexp" },
    { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
    { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
    { LLM_KV_EXPERT_COUNT, "%s.expert_count" },

@ -173,6 +185,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
    { LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
    { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
    { LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
    { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
    { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
    { LLM_KV_POOLING_TYPE, "%s.pooling_type" },

@ -190,6 +203,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
    { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
    { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
    { LLM_KV_FULL_ATTENTION_INTERVAL, "%s.full_attention_interval" },

    { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },

@ -217,22 +231,28 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
    { LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" },
    { LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" },
    { LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
    { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
    { LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },

    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
    { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
    { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
    { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
    { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
    { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
    { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
    { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
    { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
    { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
    { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
    { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },

    { LLM_KV_SPLIT_NO, "split.no" },
    { LLM_KV_SPLIT_COUNT, "split.count" },

@ -245,6 +265,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" },
    { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },

    { LLM_KV_KDA_HEAD_DIM, "%s.kda.head_dim" },

    { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },

    { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },

@ -332,6 +354,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
    { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
    { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_GATE_UP_EXPS, "blk.%d.ffn_gate_up_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },

@ -343,6 +366,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
    { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
    { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
    { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
    { LLM_TENSOR_FFN_LATENT_DOWN, "blk.%d.ffn_latent_down" },
    { LLM_TENSOR_FFN_LATENT_UP, "blk.%d.ffn_latent_up" },
    { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },

@ -353,12 +378,14 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
    { LLM_TENSOR_TOKEN_TYPES, "token_types" },
    { LLM_TENSOR_CLS, "cls" },
    { LLM_TENSOR_CLS_OUT, "cls.output" },
    { LLM_TENSOR_CLS_NORM, "cls.norm" },
    { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
    { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
    { LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
    { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
    { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
    { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
    { LLM_TENSOR_SSM_ALPHA, "blk.%d.ssm_alpha" },
    { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
    { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
    { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },

@ -370,6 +397,15 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
    { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
    { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
    { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
    { LLM_TENSOR_SSM_CONV1D_Q, "blk.%d.ssm_conv1d_q" },
    { LLM_TENSOR_SSM_CONV1D_K, "blk.%d.ssm_conv1d_k" },
    { LLM_TENSOR_SSM_CONV1D_V, "blk.%d.ssm_conv1d_v" },
    { LLM_TENSOR_SSM_F_A, "blk.%d.ssm_f_a" },
    { LLM_TENSOR_SSM_F_B, "blk.%d.ssm_f_b" },
    { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" },
    { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" },
    { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" },
    { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
    { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
    { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
    { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },

@ -496,6 +532,10 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
    { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
    { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
    { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
    { LLM_TENSOR_INDEXER_K_NORM, "blk.%d.indexer.k_norm" },
    { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
    { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
    { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
};

static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {

@ -709,6 +749,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_PADDLEOCR:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:

@ -787,6 +828,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_CLS,
                LLM_TENSOR_CLS_OUT,
            };
        case LLM_ARCH_EUROBERT:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_K,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_GATE,
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_FFN_DOWN,
            };
        case LLM_ARCH_MODERN_BERT:
            return {
                LLM_TENSOR_TOKEN_EMBD,

@ -800,6 +855,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_CLS,
                LLM_TENSOR_CLS_OUT,
                LLM_TENSOR_CLS_NORM,
            };
        case LLM_ARCH_JINA_BERT_V2:
            return {

@ -952,11 +1008,11 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_ATTN_QKV,
                LLM_TENSOR_ATTN_GATE,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_GATE_UP_EXPS,
                LLM_TENSOR_FFN_GATE_INP_SHEXP,
                LLM_TENSOR_FFN_GATE_SHEXP,
                LLM_TENSOR_FFN_DOWN_SHEXP,

@ -969,6 +1025,64 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_SSM_NORM,
                LLM_TENSOR_SSM_OUT,
            };
        case LLM_ARCH_QWEN35:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_POST_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_Q_NORM,
                LLM_TENSOR_ATTN_K,
                LLM_TENSOR_ATTN_K_NORM,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_ATTN_QKV,
                LLM_TENSOR_ATTN_GATE,
                LLM_TENSOR_FFN_GATE,
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_SSM_A_NOSCAN,
                LLM_TENSOR_SSM_CONV1D,
                LLM_TENSOR_SSM_DT,
                LLM_TENSOR_SSM_BETA,
                LLM_TENSOR_SSM_ALPHA,
                LLM_TENSOR_SSM_NORM,
                LLM_TENSOR_SSM_OUT,
            };
        case LLM_ARCH_QWEN35MOE:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_POST_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_Q_NORM,
                LLM_TENSOR_ATTN_K,
                LLM_TENSOR_ATTN_K_NORM,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_ATTN_QKV,
                LLM_TENSOR_ATTN_GATE,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_GATE_UP_EXPS,
                LLM_TENSOR_FFN_GATE_INP_SHEXP,
                LLM_TENSOR_FFN_GATE_SHEXP,
                LLM_TENSOR_FFN_DOWN_SHEXP,
                LLM_TENSOR_FFN_UP_SHEXP,
                LLM_TENSOR_SSM_A_NOSCAN,
                LLM_TENSOR_SSM_CONV1D,
                LLM_TENSOR_SSM_DT,
                LLM_TENSOR_SSM_BETA,
                LLM_TENSOR_SSM_ALPHA,
                LLM_TENSOR_SSM_NORM,
                LLM_TENSOR_SSM_OUT,
            };
        case LLM_ARCH_QWEN3VL:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_HUNYUAN_DENSE:

@ -976,6 +1090,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_CLS_OUT,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_Q_NORM,

@ -1497,6 +1612,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_GATE_UP_EXPS,
                LLM_TENSOR_FFN_GATE_INP_SHEXP,
                LLM_TENSOR_FFN_GATE_SHEXP,
                LLM_TENSOR_FFN_DOWN_SHEXP,

@ -1549,6 +1665,12 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_ATTN_POST_NORM,
                LLM_TENSOR_FFN_POST_NORM,
                LLM_TENSOR_NEXTN_EH_PROJ,
                LLM_TENSOR_NEXTN_EMBED_TOKENS,
                LLM_TENSOR_NEXTN_ENORM,
                LLM_TENSOR_NEXTN_HNORM,
                LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
                LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
            };
        case LLM_ARCH_GLM4_MOE:
            return {

@ -1581,6 +1703,46 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
                LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
            };
        case LLM_ARCH_GLM_DSA:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_Q_A_NORM,
                LLM_TENSOR_ATTN_KV_A_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_Q_A,
                LLM_TENSOR_ATTN_Q_B,
                LLM_TENSOR_ATTN_KV_A_MQA,
                LLM_TENSOR_ATTN_KV_B,
                LLM_TENSOR_ATTN_K_B,
                LLM_TENSOR_ATTN_V_B,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_GATE,
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_GATE_INP_SHEXP,
                LLM_TENSOR_FFN_GATE_SHEXP,
                LLM_TENSOR_FFN_DOWN_SHEXP,
                LLM_TENSOR_FFN_UP_SHEXP,
                LLM_TENSOR_FFN_EXP_PROBS_B,
                LLM_TENSOR_INDEXER_K_NORM,
                LLM_TENSOR_INDEXER_PROJ,
                LLM_TENSOR_INDEXER_ATTN_K,
                LLM_TENSOR_INDEXER_ATTN_Q_B,
                LLM_TENSOR_NEXTN_EH_PROJ,
                LLM_TENSOR_NEXTN_EMBED_TOKENS,
                LLM_TENSOR_NEXTN_ENORM,
                LLM_TENSOR_NEXTN_HNORM,
                LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
                LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
            };
        case LLM_ARCH_BITNET:
            return {
                LLM_TENSOR_TOKEN_EMBD,

@ -1659,6 +1821,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_GATE,
                LLM_TENSOR_FFN_DOWN,
            };
        case LLM_ARCH_JAIS2:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_K,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_FFN_DOWN,
            };
        case LLM_ARCH_NEMOTRON_H:
            return {
                LLM_TENSOR_TOKEN_EMBD,

@ -1706,6 +1882,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
                LLM_TENSOR_FFN_EXP_PROBS_B,
                LLM_TENSOR_FFN_LATENT_DOWN,
                LLM_TENSOR_FFN_LATENT_UP,
                // MoE shared expert layer
                LLM_TENSOR_FFN_DOWN_SHEXP,
                LLM_TENSOR_FFN_UP_SHEXP,

@ -1728,6 +1906,38 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_FFN_POST_NORM,
            };
        case LLM_ARCH_EXAONE_MOE:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ROPE_FREQS,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_Q_NORM,
                LLM_TENSOR_ATTN_K,
                LLM_TENSOR_ATTN_K_NORM,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_GATE,
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_GATE_SHEXP,
                LLM_TENSOR_FFN_UP_SHEXP,
                LLM_TENSOR_FFN_DOWN_SHEXP,
                LLM_TENSOR_FFN_EXP_PROBS_B,
                LLM_TENSOR_NEXTN_EH_PROJ,
                LLM_TENSOR_NEXTN_EMBED_TOKENS,
                LLM_TENSOR_NEXTN_ENORM,
                LLM_TENSOR_NEXTN_HNORM,
                LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
                LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
            };
        case LLM_ARCH_RWKV6:
            return {
                LLM_TENSOR_TOKEN_EMBD,

@ -2234,6 +2444,35 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_EXP_PROBS_B,
            };
        case LLM_ARCH_STEP35:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ROPE_FREQS,
                LLM_TENSOR_ROPE_FACTORS_LONG,
                LLM_TENSOR_ROPE_FACTORS_SHORT,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_Q_NORM,
                LLM_TENSOR_ATTN_K,
                LLM_TENSOR_ATTN_K_NORM,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_GATE,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_GATE,
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_GATE_SHEXP,
                LLM_TENSOR_FFN_UP_SHEXP,
                LLM_TENSOR_FFN_DOWN_SHEXP,
                LLM_TENSOR_FFN_EXP_PROBS_B,
            };
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_UNKNOWN:
            return {

@ -2256,6 +2495,54 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_UP,
            };
        case LLM_ARCH_KIMI_LINEAR:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_OUTPUT,
                LLM_TENSOR_ROPE_FREQS,
                LLM_TENSOR_ATTN_NORM,
                LLM_TENSOR_ATTN_Q,
                LLM_TENSOR_ATTN_K,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_OUT,
                LLM_TENSOR_FFN_NORM,
                // Dense FFN (layer 0 only)
                LLM_TENSOR_FFN_GATE,
                LLM_TENSOR_FFN_DOWN,
                LLM_TENSOR_FFN_UP,
                // MoE FFN (layers 1+)
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
                LLM_TENSOR_FFN_UP_EXPS,
                LLM_TENSOR_FFN_EXP_PROBS_B,
                // Shared experts
                LLM_TENSOR_FFN_GATE_SHEXP,
                LLM_TENSOR_FFN_DOWN_SHEXP,
                LLM_TENSOR_FFN_UP_SHEXP,
                // KDA (using SSM_ enum prefix, keeping GGUF names for backward compat)
                LLM_TENSOR_SSM_CONV1D_Q,
                LLM_TENSOR_SSM_CONV1D_K,
                LLM_TENSOR_SSM_CONV1D_V,
                LLM_TENSOR_SSM_F_A,
                LLM_TENSOR_SSM_F_B,
                LLM_TENSOR_SSM_BETA,
                LLM_TENSOR_SSM_A,
                LLM_TENSOR_SSM_G_A,
                LLM_TENSOR_SSM_G_B,
                LLM_TENSOR_SSM_DT,
                LLM_TENSOR_SSM_NORM,
                // MLA
                LLM_TENSOR_ATTN_Q_A,
                LLM_TENSOR_ATTN_Q_B,
                LLM_TENSOR_ATTN_Q_A_NORM,
                LLM_TENSOR_ATTN_KV_A_MQA,
                LLM_TENSOR_ATTN_KV_B,
                LLM_TENSOR_ATTN_K_B,
                LLM_TENSOR_ATTN_V_B,
                LLM_TENSOR_ATTN_KV_A_NORM,
            };
        default:
            GGML_ABORT("unknown architecture for tensor mapping");
    }

@ -2279,6 +2566,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CLS_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
    {LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
    {LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
    {LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},

@ -2331,6 +2619,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_SSM_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},

@ -2359,6 +2648,15 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    // Kimi KDA - Conv tensors are 4D [d_conv, 1, d_inner, 1], reshaped to 2D at runtime
    {LLM_TENSOR_SSM_CONV1D_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_CONV1D_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_CONV1D_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_F_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_SSM_F_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},

@ -2401,6 +2699,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_GATE_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
    {LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},

@ -2448,6 +2747,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_INDEXER_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
    // These tensors only exist in the last layer(s) and are treated as output tensors
    {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},

@ -2456,6 +2759,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
    // Nemotron 3 Super
    {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
};
|
||||
|
||||
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
||||
|
|
@ -2493,6 +2799,15 @@ std::string LLM_TN_IMPL::str() const {
|
|||
return name;
|
||||
}
|
||||
|
||||
std::vector<llm_arch> llm_arch_all() {
|
||||
std::vector<llm_arch> ret;
|
||||
ret.reserve(LLM_ARCH_NAMES.size());
|
||||
for (const auto & [arch, _] : LLM_ARCH_NAMES) {
|
||||
ret.push_back(arch);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
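For reference, a minimal sketch of how the new llm_arch_all() helper pairs with llm_arch_name() to enumerate every registered architecture; the loop body is illustrative and not part of this patch:

    // illustrative only: dump all registered architecture names
    #include <cstdio>

    static void dump_archs() {
        for (llm_arch arch : llm_arch_all()) {
            // llm_arch_name() falls back when the arch is missing from LLM_ARCH_NAMES
            std::printf("%s\n", llm_arch_name(arch));
        }
    }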
const char * llm_arch_name(llm_arch arch) {
    auto it = LLM_ARCH_NAMES.find(arch);
    if (it == LLM_ARCH_NAMES.end()) {

@@ -2540,6 +2855,9 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
        case LLM_ARCH_NEMOTRON_H:
        case LLM_ARCH_NEMOTRON_H_MOE:
        case LLM_ARCH_QWEN3NEXT:
        case LLM_ARCH_KIMI_LINEAR:
        case LLM_ARCH_QWEN35:
        case LLM_ARCH_QWEN35MOE:
            return true;
        default:
            return false;

@@ -4,6 +4,7 @@
#include <string>
#include <set>
#include <vector>

//
// gguf constants (sync with gguf.py)

@@ -30,6 +31,7 @@ enum llm_arch {
    LLM_ARCH_NEO_BERT,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_JINA_BERT_V3,
    LLM_ARCH_EUROBERT,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,

@@ -41,6 +43,8 @@ enum llm_arch {
    LLM_ARCH_QWEN3NEXT,
    LLM_ARCH_QWEN3VL,
    LLM_ARCH_QWEN3VLMOE,
    LLM_ARCH_QWEN35,
    LLM_ARCH_QWEN35MOE,
    LLM_ARCH_PHI2,
    LLM_ARCH_PHI3,
    LLM_ARCH_PHIMOE,

@@ -76,15 +80,18 @@ enum llm_arch {
    LLM_ARCH_CHATGLM,
    LLM_ARCH_GLM4,
    LLM_ARCH_GLM4_MOE,
    LLM_ARCH_GLM_DSA,
    LLM_ARCH_BITNET,
    LLM_ARCH_T5,
    LLM_ARCH_T5ENCODER,
    LLM_ARCH_JAIS,
    LLM_ARCH_JAIS2,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_NEMOTRON_H,
    LLM_ARCH_NEMOTRON_H_MOE,
    LLM_ARCH_EXAONE,
    LLM_ARCH_EXAONE4,
    LLM_ARCH_EXAONE_MOE,
    LLM_ARCH_RWKV6,
    LLM_ARCH_RWKV6QWEN2,
    LLM_ARCH_RWKV7,

@@ -120,9 +127,12 @@ enum llm_arch {
    LLM_ARCH_RND1,
    LLM_ARCH_PANGU_EMBED,
    LLM_ARCH_MISTRAL3,
    LLM_ARCH_PADDLEOCR,
    LLM_ARCH_MIMO2,
    LLM_ARCH_STEP35,
    LLM_ARCH_LLAMA_EMBED,
    LLM_ARCH_MAINCODER,
    LLM_ARCH_KIMI_LINEAR,
    LLM_ARCH_UNKNOWN,
};

@@ -164,6 +174,8 @@ enum llm_kv {
    LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
    LLM_KV_SWIGLU_CLAMP_EXP,
    LLM_KV_SWIGLU_CLAMP_SHEXP,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,
    LLM_KV_EXPERT_COUNT,

@@ -177,6 +189,7 @@ enum llm_kv {
    LLM_KV_EXPERT_GROUP_SCALE,
    LLM_KV_EXPERTS_PER_GROUP,
    LLM_KV_MOE_EVERY_N_LAYERS,
    LLM_KV_MOE_LATENT_SIZE,
    LLM_KV_NEXTN_PREDICT_LAYERS,
    LLM_KV_NUM_DEEPSTACK_LAYERS,
    LLM_KV_POOLING_TYPE,

@@ -194,6 +207,7 @@ enum llm_kv {
    LLM_KV_EMBEDDING_SCALE,
    LLM_KV_TOKEN_SHIFT_COUNT,
    LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
    LLM_KV_FULL_ATTENTION_INTERVAL,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,

@@ -221,8 +235,14 @@ enum llm_kv {
    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
    LLM_KV_ATTENTION_KEY_LENGTH_SWA,
    LLM_KV_ATTENTION_VALUE_LENGTH_SWA,
    LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
    LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
    LLM_KV_ATTENTION_INDEXER_TOP_K,

    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_COUNT_SWA,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_FREQ_BASE_SWA,

@@ -249,6 +269,8 @@ enum llm_kv {
    LLM_KV_SSM_GROUP_COUNT,
    LLM_KV_SSM_DT_B_C_RMS,

    LLM_KV_KDA_HEAD_DIM,

    LLM_KV_WKV_HEAD_SIZE,

    LLM_KV_TOKENIZER_MODEL,

@@ -356,6 +378,7 @@ enum llm_tensor {
    LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
    LLM_TENSOR_FFN_GATE_EXPS,
    LLM_TENSOR_FFN_UP_EXPS,
    LLM_TENSOR_FFN_GATE_UP_EXPS,
    LLM_TENSOR_FFN_DOWN_SHEXP,
    LLM_TENSOR_FFN_GATE_SHEXP,
    LLM_TENSOR_FFN_UP_SHEXP,

@@ -363,6 +386,8 @@ enum llm_tensor {
    LLM_TENSOR_FFN_GATE_CHEXPS,
    LLM_TENSOR_FFN_UP_CHEXPS,
    LLM_TENSOR_FFN_EXP_PROBS_B,
    LLM_TENSOR_FFN_LATENT_DOWN,
    LLM_TENSOR_FFN_LATENT_UP,
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
    LLM_TENSOR_LAYER_OUT_NORM,

@@ -397,6 +422,16 @@ enum llm_tensor {
    LLM_TENSOR_SSM_NORM,
    LLM_TENSOR_SSM_OUT,
    LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
    LLM_TENSOR_SSM_ALPHA, // qwen3.5
    // Kimi Linear KDA (using SSM_ prefix for consistency)
    LLM_TENSOR_SSM_CONV1D_Q, // kimi: Q conv1d weight
    LLM_TENSOR_SSM_CONV1D_K, // kimi: K conv1d weight
    LLM_TENSOR_SSM_CONV1D_V, // kimi: V conv1d weight
    LLM_TENSOR_SSM_F_A, // kimi: forget gate projection A
    LLM_TENSOR_SSM_F_B, // kimi: forget gate projection B
    LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient and qwen3.5
    LLM_TENSOR_SSM_G_A, // kimi: output gate projection A
    LLM_TENSOR_SSM_G_B, // kimi: output gate projection B
    LLM_TENSOR_TIME_MIX_W0,
    LLM_TENSOR_TIME_MIX_W1,
    LLM_TENSOR_TIME_MIX_W2,

@@ -473,6 +508,7 @@ enum llm_tensor {
    LLM_TENSOR_ENC_OUTPUT_NORM,
    LLM_TENSOR_CLS,
    LLM_TENSOR_CLS_OUT,
    LLM_TENSOR_CLS_NORM,
    LLM_TENSOR_CONV1D,
    LLM_TENSOR_CONVNEXT_DW,
    LLM_TENSOR_CONVNEXT_NORM,

@@ -497,6 +533,10 @@ enum llm_tensor {
    LLM_TENSOR_VISEXP_FFN_GATE,
    LLM_TENSOR_VISEXP_FFN_DOWN,
    LLM_TENSOR_VISEXP_FFN_UP,
    LLM_TENSOR_INDEXER_K_NORM,
    LLM_TENSOR_INDEXER_PROJ,
    LLM_TENSOR_INDEXER_ATTN_K,
    LLM_TENSOR_INDEXER_ATTN_Q_B,
    LLM_TENSOR_NEXTN_EH_PROJ,
    LLM_TENSOR_NEXTN_EMBED_TOKENS,
    LLM_TENSOR_NEXTN_ENORM,

@@ -575,6 +615,8 @@ struct llm_tensor_info {
    ggml_op op;
};

std::vector<llm_arch> llm_arch_all();

const char * llm_arch_name(llm_arch arch);

llm_arch llm_arch_from_string(const std::string & name);

@@ -394,11 +394,13 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
    clear();
    split_reset();

    const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;

    auto udata = std::make_shared<llama_ubatch::data_t>();

    udata->token .resize(n_tokens);
    udata->embd .clear();
    udata->pos .resize(n_tokens);
    udata->pos .resize(n_pos_all);
    udata->n_seq_id .resize(n_tokens);
    udata->seq_id .resize(n_tokens);
    udata->seq_id_unq.resize(0);
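The pos buffer is now sized for every position channel rather than one slot per token. A hedged sketch of the arithmetic (the concrete values here are illustrative; n_pos_per_embd is 1 for ordinary RoPE and larger for models whose tokens carry multi-dimensional positions such as M-RoPE):

    // illustrative sizing only
    const uint32_t n_tokens       = 512;
    const uint32_t n_pos_per_embd = 1; // > 1 for multi-channel positions (e.g. M-RoPE)
    const int64_t  n_pos_all      = (int64_t) n_tokens * n_pos_per_embd;
    // pos.resize(n_tokens) under-allocates whenever n_pos_per_embd > 1;
    // pos.resize(n_pos_all) reserves one slot per (token, channel) pair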
@@ -57,6 +57,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
    { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
    { "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
    { "exaone-moe", LLM_CHAT_TEMPLATE_EXAONE_MOE },
    { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
    { "granite", LLM_CHAT_TEMPLATE_GRANITE },
    { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },

@@ -137,6 +138,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
    } else if (tmpl_contains("[gMASK]<sop>")) {
        return LLM_CHAT_TEMPLATE_CHATGLM_4;
    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
        if (tmpl_contains("<|tool_declare|>")) {
            return LLM_CHAT_TEMPLATE_EXAONE_MOE;
        }
        return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
        return LLM_CHAT_TEMPLATE_GLMEDGE;

@@ -229,7 +233,7 @@ int32_t llm_chat_apply_template(
    llm_chat_template tmpl,
    const std::vector<const llama_chat_message *> & chat,
    std::string & dest, bool add_ass) {
    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
    // Taken from the research: https://github.com/ggml-org/llama.cpp/issues/5527
    std::stringstream ss;
    if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
        // chatml template

@@ -576,6 +580,22 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "[|assistant|]";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_MOE) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|system|>\n" << trim(message->content) << "<|endofturn|>\n";
            } else if (role == "user") {
                ss << "<|user|>\n" << trim(message->content) << "<|endofturn|>\n";
            } else if (role == "assistant") {
                ss << "<|assistant|>\n" << trim(message->content) << "<|endofturn|>\n";
            } else if (role == "tool") {
                ss << "<|tool|>\n" << trim(message->content) << "<|endofturn|>\n";
            }
        }
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
        // this template requires the model to have "\n\n" as EOT token
        for (size_t i = 0; i < chat.size(); i++) {
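Following the EXAONE_MOE branch above, a system/user exchange with add_ass = true should render roughly as follows (illustrative, assuming trim() only strips surrounding whitespace):

    // chat = { {"system", "You are helpful."}, {"user", "Hi"} }, add_ass = true
    // Prompt produced by the EXAONE_MOE branch:
    //
    //   <|system|>
    //   You are helpful.<|endofturn|>
    //   <|user|>
    //   Hi<|endofturn|>
    //   <|assistant|>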
@@ -36,6 +36,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MINICPM,
    LLM_CHAT_TEMPLATE_EXAONE_3,
    LLM_CHAT_TEMPLATE_EXAONE_4,
    LLM_CHAT_TEMPLATE_EXAONE_MOE,
    LLM_CHAT_TEMPLATE_RWKV_WORLD,
    LLM_CHAT_TEMPLATE_GRANITE,
    LLM_CHAT_TEMPLATE_GIGACHAT,
[file diff suppressed by the viewer: too large to display]
@@ -4,6 +4,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
#include "llama-impl.h"

#include "ggml-cpp.h"
#include "ggml-opt.h"

@@ -40,6 +41,14 @@ struct llama_context {

    ~llama_context();

    // reserve a new backend scheduler (if needed)
    // for example, when:
    //   - changing loras
    //   - changing samplers
    //   - changing attention type
    //   - etc.
    void sched_reserve();

    void synchronize();

    const llama_model & get_model() const;

@@ -96,16 +105,11 @@ struct llama_context {
    void set_causal_attn(bool value);
    void set_warmup(bool value);

    void set_adapter_lora(
            llama_adapter_lora * adapter,
            float scale);
    void set_adapters_lora(llama_adapter_lora ** adapters, size_t n_adapters, float * scales);

    bool rm_adapter_lora(
            llama_adapter_lora * adapter);
    bool adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales);

    void clear_adapter_lora();

    bool apply_adapter_cvec(
    bool set_adapter_cvec(
            const float * data,
            size_t len,
            int32_t n_embd,

@@ -204,7 +208,7 @@ private:

    // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
    uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
    uint32_t output_reserve(int32_t n_outputs);

    void output_reorder();

@@ -252,43 +256,36 @@ private:

    const llama_model & model;

    llama_cparams cparams;
    llama_adapter_cvec cvec;
    llama_adapter_loras loras;
    llama_cparams cparams;

    llama_adapter_cvec_ptr cvec;
    llama_adapter_loras_ptr loras;

    llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably

    std::unique_ptr<llama_memory_i> memory;

    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t logits_size = 0; // capacity (of floats) for logits
    float * logits = nullptr;
    buffer_view<float> logits = {nullptr, 0};

    // embeddings output (2-dimensional array: [n_outputs][n_embd])
    // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
    size_t embd_size = 0; // capacity (of floats) for embeddings
    float * embd = nullptr;
    buffer_view<float> embd = {nullptr, 0};

    // TODO: simplify
    struct sampling_info {
        // !samplers.empty() to check if any samplers are active
        std::map<llama_seq_id, llama_sampler *> samplers;

        float * logits = nullptr;
        size_t logits_size = 0;

        llama_token * sampled = nullptr;
        size_t sampled_size = 0;

        float * probs = nullptr;
        size_t probs_size = 0;

        llama_token * candidates = nullptr;
        size_t candidates_size = 0;
        buffer_view<float> logits = {nullptr, 0};
        buffer_view<llama_token> sampled = {nullptr, 0};
        buffer_view<float> probs = {nullptr, 0};
        buffer_view<llama_token> candidates = {nullptr, 0};

        std::vector<uint32_t> logits_count;
        std::vector<uint32_t> probs_count;
        std::vector<uint32_t> candidates_count;

        // optimization
        std::vector<llama_token> token_ids_full_vocab;
    };

@@ -314,6 +311,8 @@ private:

    ggml_backend_sched_ptr sched;

    bool sched_need_reserve = true;

    ggml_backend_t backend_cpu = nullptr;
    std::vector<ggml_backend_ptr> backends;

@@ -30,10 +30,15 @@ struct llama_cparams {
    bool causal_attn;
    bool offload_kqv;
    bool flash_attn;
    bool auto_fa;
    bool fused_gdn_ar; // use fused gated delta net (autoregressive)
    bool fused_gdn_ch; // use fused gated delta net (chunked)
    bool auto_fgdn;
    bool no_perf;
    bool warmup;
    bool op_offload;
    bool kv_unified;
    bool pipeline_parallel;

    enum llama_pooling_type pooling_type;

@@ -0,0 +1,12 @@
#pragma once

#include "llama-context.h"
#include "ggml.h"
#include "stdint.h"

// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
        struct llama_context * ctx,
        uint32_t n_tokens,
        uint32_t n_seqs,
        uint32_t n_outputs);
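A minimal usage sketch for the new reservation API, assuming a fully initialized ctx; per the comment above, the returned graph is only valid until the next call, so it should be inspected immediately:

    // illustrative: inspect the reserved worst-case graph for a given batch shape
    struct ggml_cgraph * gf = llama_graph_reserve(ctx, /*n_tokens=*/512, /*n_seqs=*/1, /*n_outputs=*/512);
    if (gf != nullptr) {
        // e.g. count the nodes that would be scheduled for this shape
        const int n_nodes = ggml_graph_n_nodes(gf);
        (void) n_nodes;
    }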
@@ -2,7 +2,7 @@

#include "llama-impl.h"
#include "llama-vocab.h"
#include "llama-sampling.h"
#include "llama-sampler.h"

#include <cmath>
#include <algorithm>

@@ -601,7 +601,7 @@ const char * llama_grammar_parser::parse_sequence(
            throw std::runtime_error(std::string("expecting an int at ") + pos);
        }
        const char * int_end = parse_int(pos);
        uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
        uint64_t min_times = std::stoull(std::string(pos, int_end - pos));
        pos = parse_space(int_end, is_nested);

        uint64_t max_times = UINT64_MAX; // default: no max limit

@@ -614,7 +614,7 @@ const char * llama_grammar_parser::parse_sequence(

        if (is_digit_char(*pos)) {
            const char * int_end = parse_int(pos);
            max_times = std::stoul(std::string(pos, int_end - pos));
            max_times = std::stoull(std::string(pos, int_end - pos));
            pos = parse_space(int_end, is_nested);
        }
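The stoul-to-stoull switch matters on platforms where unsigned long is 32-bit (e.g. 64-bit Windows): std::stoul returns unsigned long, so a repetition count above UINT32_MAX would throw before ever reaching the uint64_t target. A small sketch of the difference:

    #include <cstdint>
    #include <string>

    // On LLP64 targets (unsigned long == 32 bits):
    uint64_t a = std::stoul("4294967296");  // throws std::out_of_range there
    uint64_t b = std::stoull("4294967296"); // OK everywhere: returns 4294967296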
@@ -1160,13 +1160,13 @@ struct llama_grammar * llama_grammar_init_impl(
    // if there is a grammar, parse it
    // rules will be empty (default) if there are parse errors
    if (!parser.parse(grammar_str) || parser.rules.empty()) {
        fprintf(stderr, "%s: failed to parse grammar\n", __func__);
        LLAMA_LOG_ERROR("failed to parse grammar\n");
        return nullptr;
    }

    // Ensure that there is a "root" node.
    if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
        fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
    // Ensure that the grammar contains the start symbol
    if (parser.symbol_ids.find(grammar_root) == parser.symbol_ids.end()) {
        LLAMA_LOG_ERROR("grammar does not contain a '%s' symbol\n", grammar_root);
        return nullptr;
    }

@@ -1195,7 +1195,7 @@ struct llama_grammar * llama_grammar_init_impl(
            continue;
        }
        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu\n", i);
            return nullptr;
        }
    }
[file diff suppressed by the viewer: too large to display]
@@ -24,6 +24,7 @@ class llama_kv_cache_context;
class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context;
class llama_memory_hybrid_context;
class llama_memory_hybrid_iswa_context;

// certain models (typically multi-modal) can produce different types of graphs
enum llm_graph_type {

@@ -105,7 +106,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;

class llm_graph_input_embd : public llm_graph_input_i {
public:
    llm_graph_input_embd() = default;
    llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
    virtual ~llm_graph_input_embd() = default;

    void set_input(const llama_ubatch * ubatch) override;

@@ -114,6 +115,8 @@ public:

    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]

    const int64_t n_embd = 0;
};

class llm_graph_input_pos : public llm_graph_input_i {

@@ -314,6 +317,39 @@ public:
    const llama_kv_cache_context * mctx;
};

// V-less input for the KV cache
// ref: https://github.com/ggml-org/llama.cpp/pull/19067
class llm_graph_input_attn_k : public llm_graph_input_i {
public:
    llm_graph_input_attn_k(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_context * mctx) :
        hparams(hparams),
        cparams(cparams),
        mctx(mctx) {
    }
    ~llm_graph_input_attn_k() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * get_k_idxs() const { return self_k_idxs; }

    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }

    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]

    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

    const llama_hparams hparams;
    const llama_cparams cparams;

    const llama_kv_cache_context * mctx;
};

class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public:
    llm_graph_input_attn_kv_iswa(

@@ -397,6 +433,62 @@ public:
    const llama_memory_hybrid_context * mctx;
};

class llm_graph_input_mem_hybrid_k : public llm_graph_input_i {
public:
    llm_graph_input_mem_hybrid_k(
            const llama_cparams & cparams,
            std::unique_ptr<llm_graph_input_attn_k> inp_attn,
            std::unique_ptr<llm_graph_input_rs> inp_rs,
            const llama_memory_hybrid_context * mctx) :
        inp_attn(std::move(inp_attn)),
        inp_rs(std::move(inp_rs)),
        cparams(cparams),
        mctx(mctx) { }
    virtual ~llm_graph_input_mem_hybrid_k() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    std::unique_ptr<llm_graph_input_attn_k> inp_attn;
    std::unique_ptr<llm_graph_input_rs> inp_rs;

    llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); }
    llm_graph_input_rs * get_recr() const { return inp_rs.get(); }

    const llama_cparams cparams;

    const llama_memory_hybrid_context * mctx;
};

class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
public:
    llm_graph_input_mem_hybrid_iswa(
            const llama_cparams & cparams,
            std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
            std::unique_ptr<llm_graph_input_rs> inp_rs,
            const llama_memory_hybrid_iswa_context * mctx) :
        inp_attn(std::move(inp_attn)),
        inp_rs(std::move(inp_rs)),
        cparams(cparams),
        mctx(mctx) { }
    virtual ~llm_graph_input_mem_hybrid_iswa() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
    std::unique_ptr<llm_graph_input_rs> inp_rs;

    llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
    llm_graph_input_rs * get_recr() const { return inp_rs.get(); }

    const llama_cparams cparams;

    const llama_memory_hybrid_iswa_context * mctx;
};

class llm_graph_input_sampling : public llm_graph_input_i {
public:
    llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
@@ -537,7 +629,7 @@ public:

    virtual ~llm_graph_result() = default;

    ggml_tensor * get_tokens() const { return t_tokens; }
    ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
    ggml_tensor * get_logits() const { return t_logits; }
    ggml_tensor * get_embd() const { return t_embd; }
    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }

@@ -564,7 +656,8 @@ public:
    void set_params(const llm_graph_params & params);

    // important graph nodes
    ggml_tensor * t_tokens = nullptr;
    ggml_tensor * t_inp_tokens = nullptr;
    ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
    ggml_tensor * t_logits = nullptr;
    ggml_tensor * t_embd = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;

@@ -671,10 +764,11 @@ struct llm_graph_context {
            ggml_tensor * cur,
            int il) const;

    // do mat_mul, while optionally apply lora
    // do mat_mul, while optionally apply lora and per-tensor scale
    ggml_tensor * build_lora_mm(
            ggml_tensor * w,
            ggml_tensor * cur) const;
            ggml_tensor * cur,
            ggml_tensor * w_s = nullptr) const;

    // do mat_mul_id, while optionally apply lora
    ggml_tensor * build_lora_mm_id(

@@ -717,11 +811,14 @@ struct llm_graph_context {
            int64_t n_expert_used,
            llm_ffn_op_type type_op,
            bool norm_w,
            bool scale_w,
            float w_scale,
            llama_expert_gating_func_type gating_op,
            int il,
            ggml_tensor * probs_in = nullptr) const;
            ggml_tensor * probs_in = nullptr,
            ggml_tensor * gate_up_exps = nullptr,
            ggml_tensor * up_exps_s = nullptr,
            ggml_tensor * gate_exps_s = nullptr,
            ggml_tensor * down_exps_s = nullptr) const;

    ggml_tensor * build_moe_ffn(
            ggml_tensor * cur,

@@ -738,11 +835,15 @@ struct llm_graph_context {
            int64_t n_expert_used,
            llm_ffn_op_type type_op,
            bool norm_w,
            bool scale_w,
            float w_scale,
            llama_expert_gating_func_type gating_op,
            int il,
            ggml_tensor * probs_in = nullptr) const;
            ggml_tensor * probs_in = nullptr,
            ggml_tensor * gate_up_exps = nullptr,
            ggml_tensor * gate_up_exps_b = nullptr,
            ggml_tensor * up_exps_s = nullptr,
            ggml_tensor * gate_exps_s = nullptr,
            ggml_tensor * down_exps_s = nullptr) const;

    //
    // inputs

@@ -801,6 +902,21 @@ struct llm_graph_context {
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
            float kq_scale,
            int il) const;

    llm_graph_input_attn_k * build_attn_inp_k() const;

    ggml_tensor * build_attn(
            llm_graph_input_attn_k * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;
@@ -880,6 +996,9 @@ struct llm_graph_context {
    //

    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
    llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const;

    llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;

    //
    // pooling

@@ -889,7 +1008,8 @@ struct llm_graph_context {
            ggml_tensor * cls,
            ggml_tensor * cls_b,
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
            ggml_tensor * cls_out_b,
            ggml_tensor * cls_norm) const;

    //
    // sampling (backend sampling)

@@ -903,6 +1023,7 @@ struct llm_graph_context {

    void build_dense_out(
            ggml_tensor * dense_2,
            ggml_tensor * dense_2_b,
            ggml_tensor * dense_3) const;
};

@@ -62,6 +62,14 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
    return n_head/n_head_kv;
}

uint32_t llama_hparams::n_rot(uint32_t il) const {
    if (il < n_layer) {
        return is_swa(il) ? n_rot_swa : n_rot_full;
    }

    GGML_ABORT("fatal error");
}

uint32_t llama_hparams::n_embd_inp() const {
    uint32_t n_embd_inp = n_embd;

@@ -72,20 +80,36 @@ uint32_t llama_hparams::n_embd_inp() const {
    return n_embd_inp;
}

uint32_t llama_hparams::get_n_embd_out() const {
    return n_embd_out > 0 ? n_embd_out : n_embd;
uint32_t llama_hparams::n_embd_out() const {
    return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}

uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
    if (il < n_layer) {
        return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
    }

    GGML_ABORT("fatal error");
}

uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
    if (il < n_layer) {
        return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
    }

    GGML_ABORT("fatal error");
}

uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
    const uint32_t n_head_kv = this->n_head_kv(il);

    return n_embd_head_k * n_head_kv;
    return n_embd_head_k(il) * n_head_kv;
}

uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
    const uint32_t n_head_kv = this->n_head_kv(il);

    return n_embd_head_v * n_head_kv;
    return n_embd_head_v(il) * n_head_kv;
}

bool llama_hparams::is_n_embd_k_gqa_variable() const {

@@ -139,6 +163,13 @@ uint32_t llama_hparams::n_embd_r() const {
        return n_embd * (n_shortconv_l_cache - 1);
    }

    if (n_embd_head_kda != 0) {
        // for Kimi KDA layers
        // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim
        const uint32_t d_inner = n_head() * n_embd_head_kda; // 32 * 128 = 4096
        return 3 * (ssm_d_conv > 0 ? ssm_d_conv - 1 : 3) * d_inner;
    }

    // TODO: maybe support other convolution strides than 1
    // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
    // Corresponds to Mamba's conv_states size

@@ -151,6 +182,13 @@ uint32_t llama_hparams::n_embd_s() const {
        return n_embd * wkv_head_size;
    }

    if (n_embd_head_kda != 0) {
        // for Kimi KDA layers
        // Full recurrent state: head_dim * head_dim * n_head
        // h tensor shape for delta attention: [head_dim, head_dim, n_head]
        return n_embd_head_kda * n_embd_head_kda * n_head(); // 128 * 128 * 32 = 524288
    }

    // corresponds to Mamba's ssm_states size
    return ssm_d_state * ssm_d_inner;
}
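Plugging in the head counts quoted in the comments above (n_head = 32, head_dim = 128) plus an assumed conv kernel of d_conv = 4 gives a feel for the per-layer KDA state budget; these numbers are illustrative, taken from the comments rather than from a specific GGUF:

    // illustrative per-layer KDA state sizes (elements, not bytes)
    const uint32_t n_head   = 32;
    const uint32_t head_dim = 128;               // n_embd_head_kda
    const uint32_t d_conv   = 4;                 // assumed kernel width
    const uint32_t d_inner  = n_head * head_dim; // 4096

    const uint32_t conv_state = 3 * (d_conv - 1) * d_inner;   // Q, K, V -> 36864
    const uint32_t rec_state  = head_dim * head_dim * n_head; // delta-attention h -> 524288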
@@ -175,6 +213,21 @@ bool llama_hparams::is_swa(uint32_t il) const {
    GGML_ABORT("fatal error");
}

bool llama_hparams::is_mla() const {
    assert((n_embd_head_k_mla_impl == 0 && n_embd_head_v_mla_impl == 0) ||
           (n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0));

    return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0;
}

uint32_t llama_hparams::n_embd_head_k_mla() const {
    return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k();
}

uint32_t llama_hparams::n_embd_head_v_mla() const {
    return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v();
}

bool llama_hparams::has_kv(uint32_t il) const {
    if (n_layer_kv_from_start >= 0) {
        if (il < (uint32_t) n_layer_kv_from_start) {

@@ -200,42 +253,6 @@ uint32_t llama_hparams::n_layer_kv() const {
    return res;
}

bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
    assert(p0 >= 0 && p1 >= 0);

    switch (swa_type) {
        case LLAMA_SWA_TYPE_NONE:
            {
            } break;
        case LLAMA_SWA_TYPE_STANDARD:
            {
                if (p1 - p0 >= (int32_t) n_swa) {
                    return true;
                }
            } break;
        case LLAMA_SWA_TYPE_CHUNKED:
            {
                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;

                if (p0 < pos_chunk_start) {
                    return true;
                }
            } break;
        case LLAMA_SWA_TYPE_SYMMETRIC:
            {
                const int32_t half_n_swa = (int32_t) n_swa / 2;
                const int32_t pos_diff = p1 - p0;

                // Mask if outside the symmetric window
                if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
                    return true;
                }
            } break;
    }

    return false;
}

bool llama_hparams::use_mrope() const {
    return rope_sections[0] > 0 && rope_sections[1] > 0;
}

@@ -3,6 +3,7 @@
#include "llama.h"

#include <array>
#include <cassert>

// bump if necessary
#define LLAMA_MAX_LAYERS 512

@@ -41,19 +42,25 @@ struct llama_hparams {

    uint32_t n_ctx_train; // context size the model was trained on
    uint32_t n_embd;
    uint32_t n_embd_features = 0;
    uint32_t n_layer;
    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
    uint32_t n_rot;
    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_rel_attn_bkts = 0;

    // different head size for full_attention and SWA layers
    uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
    uint32_t n_embd_head_k_swa;
    uint32_t n_embd_head_v_swa;

    // different RoPE dimensions for full_attention and SWA layers
    uint32_t n_rot_full;
    uint32_t n_rot_swa;

    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
    uint32_t n_embd_head_k_mla = 0;
    uint32_t n_embd_head_v_mla = 0;
    uint32_t n_embd_head_k_mla_impl = 0;
    uint32_t n_embd_head_v_mla_impl = 0;

    // for WavTokenizer
    struct llama_hparams_posnet posnet;

@@ -82,6 +89,7 @@ struct llama_hparams {
    bool expert_weights_norm = false;
    uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
    uint32_t moe_every_n_layers = 0;
    uint32_t moe_latent_size = 0;
    uint32_t nextn_predict_layers = 0;

    float f_norm_eps;

@@ -136,6 +144,9 @@ struct llama_hparams {
    uint32_t ssm_dt_rank = 0;
    uint32_t ssm_n_group = 0;

    // for Kimi Linear KDA
    uint32_t n_embd_head_kda = 0;

    // for hybrid state space models
    std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
@@ -163,7 +174,7 @@ struct llama_hparams {
    uint32_t n_cls_out = 1;

    // output embedding dimension (0 = use n_embd)
    uint32_t n_embd_out = 0;
    uint32_t n_embd_out_impl = 0;

    // llama4 smallthinker
    uint32_t n_moe_layer_step = 0;

@@ -190,11 +201,16 @@ struct llama_hparams {
    std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
    std::array<float, LLAMA_MAX_LAYERS> xielu_eps;

    // DSA (deepseek sparse attention)
    uint32_t indexer_n_head = 0;
    uint32_t indexer_head_size = 0;
    uint32_t indexer_top_k = 0;

    // qwen3vl deepstack
    uint32_t n_deepstack_layers = 0;

    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
    // ref: https://github.com/ggml-org/llama.cpp/pull/8141
    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
    uint32_t dec_n_layer = 0;

@@ -202,6 +218,11 @@ struct llama_hparams {
    enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

    // Step35: optional per-layer clamps for (Swi)GLU
    std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_exp;   // clamping for expert FFN
    std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_shexp; // shared expert

    // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
    // dense_first means whether the pattern starts with a dense layer
    // note that if n_pattern == 0, all layers are SWA

@@ -234,11 +255,17 @@ struct llama_hparams {

    uint32_t n_gqa(uint32_t il = 0) const;

    uint32_t n_rot(uint32_t il = 0) const;

    // dimension of main + auxiliary input embeddings
    uint32_t n_embd_inp() const;

    // dimension of output embeddings
    uint32_t get_n_embd_out() const;
    uint32_t n_embd_out() const;

    // dimension of key/value embeddings for each head (per layer)
    uint32_t n_embd_head_k(uint32_t il = 0) const;
    uint32_t n_embd_head_v(uint32_t il = 0) const;

    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

@@ -268,15 +295,57 @@ struct llama_hparams {

    bool is_swa(uint32_t il) const;

    // note: currently only support if either all or none of the layers are MLA
    bool is_mla() const;

    uint32_t n_embd_head_k_mla() const;
    uint32_t n_embd_head_v_mla() const;

    bool has_kv(uint32_t il) const;

    // number of layers for which has_kv() returns true
    uint32_t n_layer_kv() const;

    // note that this function uses different SWA parameters from those in the hparams
    // note: inlined on purpose for performance reasons
    // TODO: think of a better place for this function
    // TODO: pack the SWA params in a struct?
    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
        assert(p0 >= 0 && p1 >= 0);

        switch (swa_type) {
            case LLAMA_SWA_TYPE_NONE:
                {
                } break;
            case LLAMA_SWA_TYPE_STANDARD:
                {
                    if (p1 - p0 >= (int32_t) n_swa) {
                        return true;
                    }
                } break;
            case LLAMA_SWA_TYPE_CHUNKED:
                {
                    const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;

                    if (p0 < pos_chunk_start) {
                        return true;
                    }
                } break;
            case LLAMA_SWA_TYPE_SYMMETRIC:
                {
                    const int32_t half_n_swa = (int32_t) n_swa / 2;
                    const int32_t pos_diff = p1 - p0;

                    // Mask if outside the symmetric window
                    if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
                        return true;
                    }
                } break;
        }

        return false;
    }

    bool use_mrope() const;
};
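A worked example of the three masking modes above, assuming a window of n_swa = 4 and a query at position p1 = 10 (the values follow directly from the switch, so this is just arithmetic):

    // STANDARD: a key p0 is masked once it falls n_swa or more behind the query
    //   p0 = 6, p1 = 10 -> p1 - p0 = 4 >= 4 -> masked
    //   p0 = 7, p1 = 10 -> 3 < 4            -> visible
    // CHUNKED: only keys inside the query's current chunk stay visible
    //   chunk start = (10 / 4) * 4 = 8 -> p0 = 7 masked, p0 = 8 visible
    // SYMMETRIC: keys farther than n_swa/2 in either direction are masked
    //   half = 2 -> p0 in [8, 12] visible, everything else masked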
@@ -100,18 +100,18 @@ std::string format(const char * fmt, ...) {

std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
    char buf[256];
    snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
    snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0));
    for (size_t i = 1; i < ne.size(); i++) {
        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i));
    }
    return buf;
}

std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
    char buf[256];
    snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
    snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]);
    }
    return buf;
}
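The %5-to-%6 width bump keeps shape columns aligned once a dimension reaches six digits, which is common for vocab-sized tensors; the sample values below are illustrative:

    // "%5" PRId64 : " 4096" but "151936" overflows its 5-char field and breaks alignment
    // "%6" PRId64 : "  4096" and "151936" both fit, so columns line up for dims < 1e6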
@@ -49,6 +49,16 @@ struct time_meas {
    int64_t & t_acc;
};

template <typename T>
struct buffer_view {
    T * data;
    size_t size = 0;

    bool has_data() const {
        return data && size > 0;
    }
};
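This small view type is what lets the context replace its pointer+size member pairs (logits/logits_size and friends) with a single field; a minimal usage sketch:

    // illustrative: a non-owning view over externally owned output storage
    float backing[8] = {0};

    buffer_view<float> logits = {backing, 8};
    if (logits.has_data()) {
        logits.data[0] = 1.0f; // the view does not own, free, or resize the buffer
    }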
void replace_all(std::string & s, const std::string & search, const std::string & replace);

// TODO: rename to llama_format ?

@@ -60,4 +70,6 @@ std::string llama_format_tensor_shape(const struct ggml_tensor * t);

std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);

#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
#define LLAMA_TENSOR_NAME_FGDN_AR "__fgdn_ar__"
#define LLAMA_TENSOR_NAME_FGDN_CH "__fgdn_ch__"

@@ -218,7 +218,9 @@ llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx,
}

bool llama_kv_cache_iswa::get_can_shift() const {
    return kv_base->get_size() == kv_swa->get_size();
    return kv_base->get_can_shift() &&
           kv_swa->get_can_shift() &&
           kv_base->get_size() == kv_swa->get_size();
}

void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {

@@ -97,6 +97,8 @@ llama_kv_cache::llama_kv_cache(
                __func__, hparams.n_embd_v_gqa_max());
    }

    const bool is_mla = hparams.is_mla();

    for (uint32_t il = 0; il < hparams.n_layer; il++) {
        if (!hparams.has_kv(il)) {
            LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);

@@ -130,18 +132,21 @@ llama_kv_cache::llama_kv_cache(
            throw std::runtime_error("failed to create ggml context for kv cache");
        }

        ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
        ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
        const bool has_k = true;
        const bool has_v = !is_mla;

        ggml_format_name(k, "cache_k_l%d", il);
        ggml_format_name(v, "cache_v_l%d", il);
        ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
        ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;

        has_k && ggml_format_name(k, "cache_k_l%d", il);
        has_v && ggml_format_name(v, "cache_v_l%d", il);

        std::vector<ggml_tensor *> k_stream;
        std::vector<ggml_tensor *> v_stream;

        for (uint32_t s = 0; s < n_stream; ++s) {
            k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]));
            v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]));
            k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr);
            v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
        }

        map_layer_ids[il] = layers.size();

@@ -578,7 +583,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
            break;
        }

        // remeber the position that we found
        // remember the position that we found
        res.push_back(sinfo_new);

        // store the old state of the cells in the recovery stack

@@ -647,7 +652,10 @@ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_co
                const auto & layer = layers[il];

                ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
                ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);

                if (layer.v_stream[ssrc]) {
                    ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
                }
            }
        }
    }
@@ -852,7 +860,7 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
                const llama_seq_id seq_id_cell = cells.seq_get(idx);

                // SWA mask
                if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
                if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
                    can_use = true;
                }
            }

@@ -966,6 +974,13 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &
}

bool llama_kv_cache::get_can_shift() const {
    // Step35 uses per-layer RoPE dims; K-shift assumes a single global n_rot.
    if (model.arch == LLM_ARCH_STEP35) {
        return false;
    }
    if (hparams.n_pos_per_embd() > 1) {
        return false;
    }
    return true;
}

@@ -1018,8 +1033,8 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k
    const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;

    return ggml_view_4d(ctx, k,
            hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
            ggml_row_size(k->type, hparams.n_embd_head_k),
            hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
            ggml_row_size(k->type, hparams.n_embd_head_k(il)),
            ggml_row_size(k->type, n_embd_k_gqa),
            ggml_row_size(k->type, n_embd_k_gqa*kv_size),
            ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);

@@ -1041,8 +1056,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
    if (!v_trans) {
        // note: v->nb[1] <= v->nb[2]
        return ggml_view_4d(ctx, v,
                hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
                ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
                hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
                ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1]
                ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
                ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
                ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);

@@ -1050,8 +1065,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k

    // note: v->nb[1] > v->nb[2]
    return ggml_view_4d(ctx, v,
            n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
            n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1]
            ggml_row_size(v->type, kv_size), // v->nb[2]
            ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
            ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
@@ -1237,6 +1252,197 @@ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
    }
}

struct args_set_input_kq_mask {
    const llama_hparams & hparams;
    const llama_ubatch * ubatch;

    const std::vector<llama_kv_cells> & v_cells;
    const std::vector<uint32_t> & seq_to_stream;

    uint32_t n_swa;
    llama_swa_type swa_type;

    int64_t n_kv;
    int64_t n_stream;
    int64_t n_tps;
};

template<bool causal, bool swa, bool is_2d, bool alibi>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
    //const auto & hparams = args.hparams;
    const auto & ubatch = args.ubatch;

    const auto & v_cells = args.v_cells;
    const auto & seq_to_stream = args.seq_to_stream;

    const uint32_t n_swa = args.n_swa;
    const llama_swa_type swa_type = args.swa_type;

    const int64_t n_kv = args.n_kv;
    const int64_t n_stream = args.n_stream;
    const int64_t n_tps = args.n_tps;

    // the min position in the batch for each sequence
    llama_pos seq_pos_min[LLAMA_MAX_SEQ];
    std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);

    for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
        const llama_seq_id seq_id = ubatch->seq_id[i][0];

        seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
    }

    for (uint32_t s = 0; s < n_stream; ++s) {
        // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
        std::unordered_map<llama_seq_id, uint32_t> seq_srct;
        std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;

        for (uint32_t ii = 0; ii < n_tps; ++ii) {
            const uint32_t i = s*n_tps + ii;

            const llama_seq_id seq_id = ubatch->seq_id[i][0];

            const auto & cells = v_cells.at(seq_to_stream[seq_id]);

            llama_pos p0 = -1;
            const llama_pos p1 = ubatch->pos[i];

            // for M-RoPE
            const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
            const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;

            const uint64_t idst = n_kv*i;

            // for tokens of the same sequence, the mask is mostly the same, so we can reuse it
            // the only cells that could change are the ones with positions similar to
            // those in the batch (i.e. due to causal masking, SWA, etc.)
            // keep track of those cells and shortcut the loop to save time
            // note: this optimization is not compatible with Alibi position encoding
            // ref: https://github.com/ggml-org/llama.cpp/pull/18842
            bool prev = false;

            auto & idxs = seq_idxs[seq_id];

            if (!alibi) {
                if (seq_srct.find(seq_id) != seq_srct.end()) {
                    const uint32_t srct = seq_srct[seq_id];

                    const uint64_t idst_prev = n_kv*srct;

                    std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);

                    prev = true;
                } else {
                    idxs.clear();
                    idxs.reserve(ubatch->n_tokens + n_swa + 32);

                    seq_srct[seq_id] = i;
                }
            }

            for (uint32_t jj = 0; jj < n_kv; ++jj) {
                uint32_t j = jj;

                // we have an existing mask for this sequence -> update just seq_idxs
                if (!alibi) {
                    if (prev) {
                        if (jj >= idxs.size()) {
                            break;
                        }

                        j = idxs[jj];
                    }
                }

                if (cells.is_empty(j)) {
                    goto skip;
                }

                // mask the token if not the same sequence
                if (!cells.seq_has(j, seq_id)) {
                    goto skip;
                }

                p0 = cells.pos_get(j);

                if (!alibi) {
                    if (!prev) {
                        // record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
                        if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
                            idxs.push_back(j);
                        }
                    }
                }

                if (causal) {
                    // mask future tokens
                    if (p0 > p1) {
                        goto skip;
                    }

                    // M-RoPE causal mask
                    if (is_2d) {
                        if (p0 == p1) {
                            const auto & p0_ext = cells.ext_get(j);

                            if (p0_ext.is_2d_gt(p1_x, p1_y)) {
                                goto skip;
                            }
                        }
                    }
                }

                // apply SWA if any
                if (swa) {
                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
                        goto skip;
                    }
                }

                if (alibi) {
                    data[idst + j] = -std::abs(p0 - p1);
                } else {
                    data[idst + j] = 0.0f;
                }

                continue;
skip:
                data[idst + j] = -INFINITY;
            }
        }
    }
}

template<bool causal, bool swa, bool is_2d>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
    const bool alibi = args.hparams.use_alibi;
    if (alibi) {
        set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
    } else {
        set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
    }
}

template<bool causal, bool swa>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
    const bool is_2d = args.ubatch->is_pos_2d();
    if (is_2d) {
        set_input_kq_mask_impl<causal, swa, true> (args, data);
    } else {
        set_input_kq_mask_impl<causal, swa, false>(args, data);
    }
}

template<bool causal>
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
    const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
    if (swa) {
        set_input_kq_mask_impl<causal, true> (args, data);
    } else {
        set_input_kq_mask_impl<causal, false>(args, data);
    }
}
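The cascade above is a standard runtime-to-compile-time dispatch: each level tests one flag and forwards to a template specialization, so the innermost loop is compiled with causal/swa/is_2d/alibi as constants and the per-cell branches fold away. A stripped-down sketch of the same pattern (not the actual mask code):

    // minimal sketch of boolean template dispatch
    template <bool flag>
    static int kernel(int x) {
        if constexpr (flag) {
            return x * 2; // this branch only exists in kernel<true>
        } else {
            return x;
        }
    }

    static int dispatch(bool flag, int x) {
        return flag ? kernel<true>(x) : kernel<false>(x); // runtime bool -> compile-time constant
    }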
void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
|
||||
const uint32_t n_tokens = ubatch->n_tokens;
|
||||
|
||||
|
|
@ -1251,74 +1457,29 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
    // n_tps == n_tokens_per_stream
    const int64_t n_tps = n_tokens/n_stream;

    std::fill(data, data + ggml_nelements(dst), -INFINITY);
    //const int64_t t_start = ggml_time_us();

    // Use only the previous KV cells of the correct sequence for each token of the ubatch.
    // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
    // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
    //   Causal mask:
    //      xxx-------
    //      xxxx------
    //      xxxxx-----
    //   Non-causal mask:
    //      xxxxx-----
    //      xxxxx-----
    //      xxxxx-----
    // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
    // TODO: optimize this section
    for (uint32_t h = 0; h < 1; ++h) {
        for (uint32_t s = 0; s < n_stream; ++s) {
            for (uint32_t ii = 0; ii < n_tps; ++ii) {
                const uint32_t i = s*n_tps + ii;
    const args_set_input_kq_mask args = {
        /*.hparams       =*/ hparams,
        /*.ubatch        =*/ ubatch,
        /*.v_cells       =*/ v_cells,
        /*.seq_to_stream =*/ seq_to_stream,
        /*.n_swa         =*/ n_swa,
        /*.swa_type      =*/ swa_type,
        /*.n_kv          =*/ n_kv,
        /*.n_stream      =*/ n_stream,
        /*.n_tps         =*/ n_tps,
    };

                const llama_seq_id seq_id = ubatch->seq_id[i][0];

                const auto & cells = v_cells[seq_to_stream[seq_id]];

                const llama_pos p1 = ubatch->pos[i];

                // for M-RoPE
                const bool is_2d = ubatch->is_pos_2d();
                const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
                const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens]   : 0;

                const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);

                for (uint32_t j = 0; j < n_kv; ++j) {
                    if (cells.is_empty(j)) {
                        continue;
                    }

                    // mask the token if not the same sequence
                    if (!cells.seq_has(j, seq_id)) {
                        continue;
                    }

                    const llama_pos p0 = cells.pos_get(j);

                    // mask future tokens
                    if (causal_attn && p0 > p1) {
                        continue;
                    }

                    // M-RoPE causal mask
                    if (causal_attn && is_2d && p0 == p1) {
                        const auto & p0_ext = cells.ext_get(j);
                        if (p0_ext.is_2d_gt(p1_x, p1_y)) {
                            continue;
                        }
                    }

                    // apply SWA if any
                    if (is_masked_swa(p0, p1)) {
                        continue;
                    }

                    data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
                }
            }
        }
    }
    if (causal_attn) {
        set_input_kq_mask_impl<true> (args, data);
    } else {
        set_input_kq_mask_impl<false>(args, data);
    }

    //const int64_t t_end = ggml_time_us();

    //LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
}

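// Hedged sketch of what the mask values produced above mean downstream: the
// mask is added to the attention scores before softmax, so 0.0f leaves a KV
// cell visible, -INFINITY removes it entirely (exp(-inf) == 0), and the ALiBi
// variant biases scores by token distance. Assumes at least one visible cell.
#include <algorithm>
#include <cmath>
#include <vector>

static std::vector<float> masked_softmax(const std::vector<float> & scores, const std::vector<float> & mask) {
    std::vector<float> out(scores.size());
    float max_v = -INFINITY;
    for (size_t i = 0; i < scores.size(); ++i) {
        out[i] = scores[i] + mask[i]; // -INFINITY knocks the cell out
        max_v  = std::max(max_v, out[i]);
    }
    float sum = 0.0f;
    for (float & v : out) {
        v    = std::exp(v - max_v);   // exp(-INFINITY) == 0
        sum += v;
    }
    for (float & v : out) {
        v /= sum;
    }
    return out;
}
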
void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {

@ -1370,7 +1531,7 @@ size_t llama_kv_cache::size_v_bytes() const {
    size_t size_v_bytes = 0;

    for (const auto & layer : layers) {
        size_v_bytes += ggml_nbytes(layer.v);
        size_v_bytes += layer.v ? ggml_nbytes(layer.v) : 0;
    }

    return size_v_bytes;

@ -1383,7 +1544,8 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
        ggml_tensor * shift,
        ggml_tensor * factors,
        float freq_base,
        float freq_scale) const {
        float freq_scale,
        uint32_t il) const {
    const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

    const auto & yarn_ext_factor = cparams.yarn_ext_factor;

@ -1391,7 +1553,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
    const auto & yarn_beta_slow   = cparams.yarn_beta_slow;
    const auto & yarn_attn_factor = cparams.yarn_attn_factor;

    const auto & n_rot = hparams.n_rot;
    const auto & n_rot = hparams.n_rot(il);
    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
        // @ngxson : this is a workaround
        // for M-RoPE, we want to rotate the whole vector when doing KV shift
@ -1445,9 +1607,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
    auto * ctx = res->get_ctx();
    auto * gf  = res->get_gf();

    const auto & n_embd_head_k = hparams.n_embd_head_k;
    //const auto & n_embd_head_v = hparams.n_embd_head_v;

    auto inp = std::make_unique<llm_graph_input_k_shift>(this);

    inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);

@ -1461,6 +1620,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
        const int64_t n_head_kv    = hparams.n_head_kv(il);
        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);

        const auto n_rot         = hparams.n_rot(il);
        const auto n_embd_head_k = hparams.n_embd_head_k(il);
        const auto n_embd_nope   = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;

        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

@ -1468,12 +1631,12 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co

        ggml_tensor * k =
            ggml_view_3d(ctx, layer.k,
                n_embd_head_k, n_head_kv, get_size()*n_stream,
                n_rot, n_head_kv, get_size()*n_stream,
                ggml_row_size(layer.k->type, n_embd_head_k),
                ggml_row_size(layer.k->type, n_embd_k_gqa),
                0);
                ggml_row_size(layer.k->type, n_embd_nope));

        ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
        ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);

        ggml_build_forward_expand(gf, cur);
    }

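// Hedged sketch of the view arithmetic above (assumed example sizes): the
// shift view keeps the full n_embd_k_gqa row stride so cache cell c still
// starts at c * row_stride, while the data offset skips the non-rotary
// ("nope") prefix so only the n_rot rotary elements per head are shifted.
#include <cstddef>
#include <cstdint>

struct k_shift_view {
    size_t offset_bytes; // start of the rotary slice within a row
    size_t row_stride;   // distance between consecutive cache cells
};

static k_shift_view make_k_shift_view(size_t elem_size, int64_t n_embd_k_gqa, int64_t n_embd_nope) {
    return {
        elem_size * (size_t) n_embd_nope,  // cf. ggml_row_size(type, n_embd_nope)
        elem_size * (size_t) n_embd_k_gqa, // cf. ggml_row_size(type, n_embd_k_gqa)
    };
}
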
@ -1483,10 +1646,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
    return gf;
}

bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
    return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
}

void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
    GGML_UNUSED(flags);

@ -1599,8 +1758,10 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t
        io.write(&pos,      sizeof(pos));
        io.write(&n_seq_id, sizeof(n_seq_id));

        // TODO: we also need to save llama_kv_cell_ext when apply_ubatch() support loading it
        // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
        if (hparams.n_pos_per_embd() > 1) {
            const llama_kv_cell_ext ext = cells.ext_get(i);
            io.write(&ext, sizeof(ext));
        }

        for (const auto & seq_id : seq_ids) {
            io.write(&seq_id, sizeof(seq_id));
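// Hedged sketch of the serialization invariant introduced above (hypothetical
// types): the optional ext record exists in the stream iff
// hparams.n_pos_per_embd() > 1, so writer and reader must evaluate exactly
// the same condition or every later field is misaligned.
#include <cstdint>

struct cell_ext_sketch { int32_t x; int32_t y; }; // assumed layout

template <typename IO>
static void write_cell_ext(IO & io, bool has_ext, const cell_ext_sketch & ext) {
    if (has_ext) {
        io.write(&ext, sizeof(ext));
    }
}

template <typename IO>
static void read_cell_ext(IO & io, bool has_ext, cell_ext_sketch & ext) {
    if (has_ext) { // must mirror the writer's condition
        io.read_to(&ext, sizeof(ext));
    }
}
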
@ -1618,8 +1779,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
    io.write(&v_trans, sizeof(v_trans));
    io.write(&n_layer, sizeof(n_layer));

    std::vector<uint8_t> tmp_buf;

    // Iterate and write all the keys first, each row is a cell
    // Get whole range at a time
    for (const auto & layer : layers) {
@ -1637,7 +1796,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
        const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
        io.write(&k_size_row, sizeof(k_size_row));

        // Read each range of cells of k_size length each into tmp_buf and write out
        // Read each range of cells of k_size length and write out
        for (const auto & range : cr.data) {
            const size_t range_size = range.second - range.first;
            const size_t buf_size = range_size * k_size_row;
@ -1652,6 +1811,9 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

            auto * v = layer.v_stream[cr.strm];
            if (!v) {
                continue;
            }

            // Write value type
            const int32_t v_type_i = (int32_t) v->type;
@ -1661,7 +1823,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
            const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
            io.write(&v_size_row, sizeof(v_size_row));

            // Read each range of cells of v_size length each into tmp_buf and write out
            // Read each range of cells of v_size length and write out
            for (const auto & range : cr.data) {
                const size_t range_size = range.second - range.first;
                const size_t buf_size = range_size * v_size_row;
@ -1678,6 +1840,9 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

            auto * v = layer.v_stream[cr.strm];
            if (!v) {
                continue;
            }

            // Write value type
            const int32_t v_type_i = (int32_t) v->type;
@ -1692,7 +1857,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t

            // For each row, we get the element values of each cell
            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                // Read each range of cells of v_size_el length each into tmp_buf and write out
                // Read each range of cells of v_size_el length and write out
                for (const auto & range : cr.data) {
                    const size_t range_size = range.second - range.first;
                    const size_t src_offset = (range.first + j * kv_size) * v_size_el;
@ -1730,6 +1895,14 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
            return false;
        }

        if (hparams.n_pos_per_embd() > 1) {
            llama_kv_cell_ext ext;
            io.read_to(&ext, sizeof(ext));

            ubatch.pos[i + ubatch.n_tokens]   = ext.y;
            ubatch.pos[i + ubatch.n_tokens*2] = ext.x;
        }

        // read the sequence id, but directly discard it - we will use dest_seq_id instead
        {
            llama_seq_id seq_id;
@ -1780,6 +1953,12 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32

        cells.pos_set(i, pos);

        if (hparams.n_pos_per_embd() > 1) {
            llama_kv_cell_ext ext;
            io.read_to(&ext, sizeof(ext));
            cells.ext_set(i, ext);
        }

        for (uint32_t j = 0; j < n_seq_id; ++j) {
            llama_seq_id seq_id;
            io.read_to(&seq_id, sizeof(seq_id));
@ -1881,6 +2060,9 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

        auto * v = layer.v_stream[strm];
        if (!v) {
            continue;
        }

        // Read type of value
        int32_t v_type_i_ref;
@ -1922,6 +2104,9 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

        auto * v = layer.v_stream[strm];
        if (!v) {
            continue;
        }

        // Read type of value
        int32_t v_type_i_ref;

@ -257,8 +257,6 @@ private:
    size_t size_k_bytes() const;
    size_t size_v_bytes() const;

    bool is_masked_swa(llama_pos p0, llama_pos p1) const;

    ggml_tensor * build_rope_shift(
        const llama_cparams & cparams,
        ggml_context * ctx,

@ -266,7 +264,8 @@ private:
        ggml_tensor * shift,
        ggml_tensor * factors,
        float freq_base,
        float freq_scale) const;
        float freq_scale,
        uint32_t il) const;

    ggml_cgraph * build_graph_shift(
        llm_graph_result * res,

@ -0,0 +1,275 @@
#include "llama-memory-hybrid-iswa.h"

#include "llama-impl.h"
#include "llama-model.h"
#include "llama-context.h"

//
// llama_memory_hybrid_iswa
//

llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
        const llama_model & model,
        /* attn */
        ggml_type type_k,
        ggml_type type_v,
        bool v_trans,
        bool swa_full,
        uint32_t kv_size,
        uint32_t n_ubatch,
        uint32_t n_pad,
        /* recurrent */
        ggml_type type_r,
        ggml_type type_s,
        uint32_t rs_size,
        /* common */
        uint32_t n_seq_max,
        bool offload,
        bool unified,
        /* layer filters */
        const layer_filter_cb & filter_attn,
        const layer_filter_cb & filter_recr) :
    hparams(model.hparams),
    mem_attn(new llama_kv_cache_iswa(
        model,
        type_k,
        type_v,
        v_trans,
        offload,
        swa_full,
        unified,
        kv_size,
        n_seq_max,
        n_ubatch,
        n_pad,
        filter_attn == nullptr ?
            [&](int32_t il) { return !hparams.is_recurrent(il); }
            : filter_attn,
        nullptr
    )),
    mem_recr(new llama_memory_recurrent(
        model,
        type_r,
        type_s,
        offload,
        rs_size,
        n_seq_max,
        filter_recr == nullptr ?
            [&](int32_t il) { return hparams.is_recurrent(il); }
            : filter_recr
    )) {}

llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
    do {
        balloc.split_reset();

        // follow the recurrent pattern for creating the ubatch splits
        std::vector<llama_ubatch> ubatches;

        while (true) {
            llama_ubatch ubatch;

            if (embd_all) {
                // if all tokens are output, split by sequence
                ubatch = balloc.split_seq(n_ubatch);
            } else {
                // TODO: non-sequential equal split can be done if using unified KV cache
                // for simplicity, we always use sequential equal split for now
                ubatch = balloc.split_equal(n_ubatch, true);
            }

            if (ubatch.n_tokens == 0) {
                break;
            }

            ubatches.push_back(std::move(ubatch)); // NOLINT
        }

        if (balloc.get_n_used() < balloc.get_n_tokens()) {
            // failed to find a suitable split
            break;
        }

        // prepare the recurrent batches first
        if (!mem_recr->prepare(ubatches)) {
            // TODO: will the recurrent cache be in an undefined context at this point?
            LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
            return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
        }

        // prepare the attention cache (iswa version returns both base and swa slot infos)
        auto sinfos_base = mem_attn->get_base()->prepare(ubatches);
        if (sinfos_base.empty()) {
            LLAMA_LOG_ERROR("%s: failed to prepare attention base ubatches\n", __func__);
            return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
        }

        auto sinfos_swa = mem_attn->get_swa()->prepare(ubatches);
        if (sinfos_swa.empty()) {
            LLAMA_LOG_ERROR("%s: failed to prepare attention swa ubatches\n", __func__);
            return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
        }

        return std::make_unique<llama_memory_hybrid_iswa_context>(
            this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
    } while(false);

    return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}

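// Standalone sketch of the do { ... } while (false) idiom used by init_batch()
// above (hypothetical example): `break` jumps every failure case to one shared
// fallback return without goto and without duplicating the error path.
#include <cstdio>

static int parse_percentage(int value) {
    do {
        if (value < 0) {
            break; // fall through to the shared failure path
        }
        if (value > 100) {
            break;
        }
        return value; // success returns directly
    } while (false);

    std::fprintf(stderr, "validation failed\n");
    return -1;
}
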
llama_memory_context_ptr llama_memory_hybrid_iswa::init_full() {
    return std::make_unique<llama_memory_hybrid_iswa_context>(this);
}

llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * lctx, bool optimize) {
    return std::make_unique<llama_memory_hybrid_iswa_context>(this, lctx, optimize);
}

bool llama_memory_hybrid_iswa::get_can_shift() const {
    // Shifting is trivially supported for recurrent
    return mem_attn->get_can_shift();
}

void llama_memory_hybrid_iswa::clear(bool data) {
    mem_attn->clear(data);
    mem_recr->clear(data);
}

bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    // Try removing from the recurrent cache first since it may fail. If it does
    // fail, the cache will not have been mutated.
    if (!mem_recr->seq_rm(seq_id, p0, p1)) {
        return false;
    }
    return mem_attn->seq_rm(seq_id, p0, p1);
}

void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
    mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
    mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}

void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) {
    mem_attn->seq_keep(seq_id);
    mem_recr->seq_keep(seq_id);
}

void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
    mem_attn->seq_add(seq_id, p0, p1, shift);
    mem_recr->seq_add(seq_id, p0, p1, shift);
}

void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
    mem_attn->seq_div(seq_id, p0, p1, d);
    mem_recr->seq_div(seq_id, p0, p1, d);
}

llama_pos llama_memory_hybrid_iswa::seq_pos_min(llama_seq_id seq_id) const {
    // the min of the total cache is the max of the two caches' min values
    return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
}

llama_pos llama_memory_hybrid_iswa::seq_pos_max(llama_seq_id seq_id) const {
    // the max of the total cache is the min of the two caches' max values
    return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
}

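// Hedged sketch of why seq_pos_min()/seq_pos_max() combine the two members
// this way: a position is usable only if *both* caches still hold it, so the
// valid range is the intersection [max(lo_a, lo_b), min(hi_a, hi_b)].
#include <algorithm>
#include <cstdint>

struct pos_range { int64_t lo, hi; };

static pos_range intersect(pos_range a, pos_range b) {
    return { std::max(a.lo, b.lo), std::min(a.hi, b.hi) };
}
// e.g. intersect({0, 90}, {40, 100}) yields {40, 90}
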
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid_iswa::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
    for (const auto & buft_size : mem_recr->memory_breakdown()) {
        mb[buft_size.first] += buft_size.second;
    }
    return mb;
}

void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
    mem_attn->state_write(io, seq_id, flags);
    mem_recr->state_write(io, seq_id, flags);
}

void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
    mem_attn->state_read(io, seq_id, flags);
    mem_recr->state_read(io, seq_id, flags);
}

llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const {
    return mem_attn.get();
}

llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
    return mem_recr.get();
}

//
// llama_memory_hybrid_iswa_context
//

llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {}

llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) :
    ctx_attn(mem->get_mem_attn()->init_full()),
    ctx_recr(mem->get_mem_recr()->init_full()),
    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}

llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
        llama_memory_hybrid_iswa * mem,
        llama_context * lctx,
        bool optimize) :
    ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
    ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}

llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
        llama_memory_hybrid_iswa * mem,
        slot_info_vec_t sinfos_base,
        slot_info_vec_t sinfos_swa,
        std::vector<llama_ubatch> ubatches) :
    ubatches(std::move(ubatches)),
    // note: here we copy the ubatches. not sure if this is ideal
    ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)),
    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}

bool llama_memory_hybrid_iswa_context::next() {
    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

    ctx_attn->next();
    ctx_recr->next();

    if (++i_next >= ubatches.size()) {
        return false;
    }

    return true;
}

bool llama_memory_hybrid_iswa_context::apply() {
    assert(!llama_memory_status_is_fail(status));

    bool res = true;

    res = res & ctx_attn->apply();
    res = res & ctx_recr->apply();

    return res;
}

llama_memory_status llama_memory_hybrid_iswa_context::get_status() const {
    return status;
}

const llama_ubatch & llama_memory_hybrid_iswa_context::get_ubatch() const {
    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
    return ubatches[i_next];
}

const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn() const {
    return static_cast<const llama_kv_cache_iswa_context *>(ctx_attn.get());
}

const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const {
    return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
}

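// Minimal sketch (hypothetical driver, not part of the diff) of how a memory
// context like the one above is typically consumed: apply the current ubatch,
// then advance with next() until all ubatches are processed.
struct memory_context_sketch {
    virtual ~memory_context_sketch() = default;
    virtual bool next()  = 0;
    virtual bool apply() = 0;
};

static bool run_all_ubatches(memory_context_sketch & mctx) {
    while (true) {
        if (!mctx.apply()) {
            return false; // applying the current ubatch failed
        }
        // ... consume the current ubatch here ...
        if (!mctx.next()) {
            return true;  // no more ubatches
        }
    }
}
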
@ -0,0 +1,140 @@
#pragma once

#include "llama-batch.h"
#include "llama-graph.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory.h"
#include "llama-memory-recurrent.h"

#include <memory>
#include <vector>

//
// llama_memory_hybrid_iswa
//

// utilizes instances of llama_memory_recurrent and llama_kv_cache_iswa to
// support models where each layer may be either attention-based (with SWA support) or recurrent

class llama_memory_hybrid_iswa : public llama_memory_i {
public:
    llama_memory_hybrid_iswa(
        const llama_model & model,
        /* attn */
        ggml_type type_k,
        ggml_type type_v,
        bool v_trans,
        bool swa_full,
        uint32_t kv_size,
        uint32_t n_ubatch,
        uint32_t n_pad,
        /* recurrent */
        ggml_type type_r,
        ggml_type type_s,
        uint32_t rs_size,
        /* common */
        uint32_t n_seq_max,
        bool offload,
        bool unified,
        /* layer filters */
        const layer_filter_cb & filter_attn = nullptr,
        const layer_filter_cb & filter_recr = nullptr);

    ~llama_memory_hybrid_iswa() = default;

    //
    // llama_memory_i
    //

    llama_memory_context_ptr init_batch(
            llama_batch_allocr & balloc,
            uint32_t n_ubatch,
            bool embd_all) override;

    llama_memory_context_ptr init_full() override;

    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;

    bool get_can_shift() const override;

    void clear(bool data) override;

    bool seq_rm  (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
    void seq_keep(llama_seq_id seq_id) override;
    void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
    void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;

    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;

    // state write/load

    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

    //
    // llama_memory_hybrid_iswa specific API
    //

    llama_kv_cache_iswa    * get_mem_attn() const;
    llama_memory_recurrent * get_mem_recr() const;

private:
    const llama_hparams & hparams;

    const std::unique_ptr<llama_kv_cache_iswa>    mem_attn;
    const std::unique_ptr<llama_memory_recurrent> mem_recr;
};

class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
public:
    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;

    // init failure
    explicit llama_memory_hybrid_iswa_context(llama_memory_status status);

    // init full
    explicit llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem);

    // init update
    explicit llama_memory_hybrid_iswa_context(
        llama_memory_hybrid_iswa * mem,
        llama_context * lctx,
        bool optimize);

    // init success
    llama_memory_hybrid_iswa_context(
        llama_memory_hybrid_iswa * mem,
        slot_info_vec_t sinfos_base,
        slot_info_vec_t sinfos_swa,
        std::vector<llama_ubatch> ubatches);

    ~llama_memory_hybrid_iswa_context() = default;

    bool next()  override;
    bool apply() override;

    llama_memory_status get_status() const override;
    const llama_ubatch & get_ubatch() const override;

    //
    // llama_memory_hybrid_iswa_context
    //

    const llama_kv_cache_iswa_context    * get_attn() const;
    const llama_memory_recurrent_context * get_recr() const;

private:
    // the index of the next ubatch to process
    size_t i_next = 0;

    std::vector<llama_ubatch> ubatches;

    const llama_memory_context_ptr ctx_attn;
    const llama_memory_context_ptr ctx_recr;

    const llama_memory_status status;
};

@ -163,7 +163,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
            const auto & cell = cells[tail_id];
            // partial intersection is invalid if it includes the final pos
            if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1);
                return false;
            }
            // invalidate tails which will be cleared
@ -785,23 +785,21 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
    io.write(&s_trans, sizeof(s_trans));
    io.write(&n_layer, sizeof(n_layer));

    std::vector<uint8_t> tmp_buf;

    // Iterate and write all the keys first, each row is a cell
    // Iterate and write all the R tensors first, each row is a cell
    // Get whole range at a time
    for (uint32_t il = 0; il < n_layer; ++il) {
        // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
        if (r_l[il] == nullptr) continue;

        // Write key type
        // Write R tensor type
        const int32_t r_type_i = (int32_t)r_l[il]->type;
        io.write(&r_type_i, sizeof(r_type_i));

        // Write row size of key
        // Write row size of R tensor
        const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
        io.write(&r_size_row, sizeof(r_size_row));

        // Read each range of cells of k_size length each into tmp_buf and write out
        // Write each range of cells of r_size_row length
        for (const auto & range : cell_ranges) {
            const size_t range_size = range.second - range.first;
            const size_t buf_size = range_size * r_size_row;
@ -814,15 +812,15 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
        // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
        if (s_l[il] == nullptr) continue;

        // Write value type
        // Write S tensor type
        const int32_t s_type_i = (int32_t)s_l[il]->type;
        io.write(&s_type_i, sizeof(s_type_i));

        // Write row size of value
        // Write row size of S tensor
        const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
        io.write(&s_size_row, sizeof(s_size_row));

        // Read each range of cells of s_size length each into tmp_buf and write out
        // Write each range of S tensor rows
        for (const auto & range : cell_ranges) {
            const size_t range_size = range.second - range.first;
            const size_t buf_size = range_size * s_size_row;
@ -830,7 +828,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
            }
        }
    } else {
        // When v is transposed, we also need the element size and get the element ranges from each row
        // When S tensor is transposed, we also need the element size and get the element ranges from each row
        const uint32_t mem_size = size;
        for (uint32_t il = 0; il < n_layer; ++il) {
            // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)

@ -838,7 +836,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::

            const uint32_t n_embd_s = hparams.n_embd_s();

            // Write value type
            // Write S tensor type
            const int32_t s_type_i = (int32_t)s_l[il]->type;
            io.write(&s_type_i, sizeof(s_type_i));

@ -851,7 +849,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::

            // For each row, we get the element values of each cell
            for (uint32_t j = 0; j < n_embd_s; ++j) {
                // Read each range of cells of v_size_el length each into tmp_buf and write out
                // Write each range of cells of s_size_el length
                for (const auto & range : cell_ranges) {
                    const size_t range_size = range.second - range.first;
                    const size_t src_offset = (range.first + j * mem_size) * s_size_el;
@ -244,11 +244,14 @@ struct llama_file::impl {
    }
    errno = 0;
    if (fd == -1) {
        std::size_t ret = std::fread(ptr, len, 1, fp);
        const size_t curr_off = tell();
        const size_t to_read  = std::min(len, size - curr_off);

        std::size_t ret = std::fread(ptr, to_read, 1, fp);
        if (ferror(fp)) {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret != 1) {
        if (to_read > 0 && ret != 1) {
            throw std::runtime_error("unexpectedly reached end of file");
        }
    } else {
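// Standalone sketch of the clamped read above (hypothetical wrapper): never
// ask fread for more bytes than remain in the file, and treat a zero-length
// request as success rather than as an unexpected EOF.
#include <algorithm>
#include <cstdio>
#include <stdexcept>

static void read_exact(std::FILE * fp, void * ptr, size_t len, size_t file_size, size_t curr_off) {
    const size_t to_read = std::min(len, file_size - curr_off);
    const size_t ret = std::fread(ptr, to_read, 1, fp);
    if (std::ferror(fp)) {
        throw std::runtime_error("read error");
    }
    if (to_read > 0 && ret != 1) { // fread returns 0 items for a 0-byte request
        throw std::runtime_error("unexpectedly reached end of file");
    }
}
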
@ -262,7 +265,8 @@ struct llama_file::impl {
                continue; // Interrupted by signal, retry
            }
            // Fallback to std::fread in case the DMA controller cannot access the buffer
            if (errno == EFAULT) {
            if (errno == EFAULT || errno == EINVAL) {
                LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
                auto curr_off = tell();
                close(fd);
                fd = -1;
@ -381,6 +385,9 @@ int llama_file::file_id() const {
#ifdef _WIN32
    return _fileno(pimpl->fp);
#else
    if (pimpl->fd != -1) {
        return pimpl->fd;
    }
#if defined(fileno)
    return fileno(pimpl->fp);
#else
@ -497,6 +504,8 @@ struct llama_mmap::impl {
        }
    }
#elif defined(_WIN32)
    HANDLE hMapping = nullptr;

    impl(struct llama_file * file, size_t prefetch, bool numa) {
        GGML_UNUSED(numa);

@ -504,7 +513,7 @@ struct llama_mmap::impl {

        HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);

        if (hMapping == NULL) {
            DWORD error = GetLastError();

@ -513,9 +522,9 @@ struct llama_mmap::impl {

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        DWORD error = GetLastError();
        CloseHandle(hMapping);

        if (addr == NULL) {
            CloseHandle(hMapping);
            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }

@ -547,9 +556,17 @@ struct llama_mmap::impl {
    }

    ~impl() {
        if (!UnmapViewOfFile(addr)) {
            LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        if (hMapping) {
            if (addr) {
                if (!UnmapViewOfFile(addr)) {
                    LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
                            llama_format_win_err(GetLastError()).c_str());
                }
            }
            if (!CloseHandle(hMapping)) {
                LLAMA_LOG_WARN("warning: CloseHandle failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
            }
        }
    }
#else

@ -611,9 +628,9 @@ struct llama_mlock::impl {

        char* errmsg = std::strerror(errno);
        bool suggest = (errno == ENOMEM);
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
        // visionOS/tvOS dont't support RLIMIT_MEMLOCK
        // Skip resource limit checks on visionOS/tvOS
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
        // visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
        // Skip resource limit checks on these platforms
        suggest = false;
#else
        struct rlimit lock_limit;

@ -1,11 +1,17 @@
#include "llama-model-loader.h"

#include "ggml-alloc.h"
#include "ggml.h"
#include "gguf.h"
#include "llama-hparams.h"

#include <algorithm>
#include <array>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <future>
#include <regex>

static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;

@ -36,6 +42,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q5_1:      return "Q5_1";
        case LLAMA_FTYPE_MOSTLY_Q8_0:      return "Q8_0";
        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
        case LLAMA_FTYPE_MOSTLY_NVFP4:     return "NVFP4";
        case LLAMA_FTYPE_MOSTLY_Q2_K:      return "Q2_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:    return "Q2_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:    return "Q3_K - Small";
@ -262,7 +269,7 @@ namespace GGUFMeta {
    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
        const int kid = gguf_find_key(meta.get(), key.c_str());
        const int kid = gguf_find_key(metadata, key.c_str());

        if (kid < 0) {
            if (required) {

@ -272,7 +279,7 @@ namespace GGUFMeta {
        }

        struct GGUFMeta::ArrayInfo arr_info =
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);


        result = arr_info.length;
@ -289,7 +296,7 @@ namespace GGUFMeta {

    template<typename T>
    bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
        const gguf_context * ctx = meta.get();
        const gguf_context * ctx = metadata;
        const int kid = gguf_find_key(ctx, key.c_str());

        if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {

@ -330,7 +337,7 @@ namespace GGUFMeta {

    template<typename T, size_t N_MAX>
    bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
        const gguf_context * ctx = meta.get();
        const gguf_context * ctx = metadata;
        const int kid = gguf_find_key(ctx, key.c_str());

        if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
@ -344,6 +351,7 @@ namespace GGUFMeta {
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);

        switch (arr_info.gt) {
            case GGUF_TYPE_BOOL:
            case GGUF_TYPE_UINT32:
            case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
                                              (std::is_same<T, uint32_t>::value)); break;

@ -365,7 +373,13 @@ namespace GGUFMeta {
                result[i] = value;
            }
        } else {
            std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
            if (arr_info.gt == GGUF_TYPE_BOOL) {
                std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
                    return static_cast<T>(x);
                });
            } else {
                std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
            }
        }

        return true;

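// Standalone sketch of the widening copy above: casting the raw GGUF data
// pointer to (const T *) would apply T's stride over byte-sized bool storage,
// so each bool element is widened explicitly instead.
#include <algorithm>
#include <cstdint>
#include <vector>

static std::vector<int32_t> widen_bools(const bool * data, size_t n) {
    std::vector<int32_t> out(n);
    std::transform(data, data + n, out.begin(), [](bool x) {
        return static_cast<int32_t>(x); // 0 or 1
    });
    return out;
}
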
@ -385,7 +399,7 @@ namespace GGUFMeta {
        const struct llama_model_kv_override * override =
            it != kv_overrides.end() ? &it->second : nullptr;

        const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
        const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override);

        if (required && !found) {
            throw std::runtime_error(format("key not found in model: %s", key.c_str()));

@ -419,7 +433,7 @@ namespace GGUFMeta {
    // get array of n <= N_MAX elements, or a single element repeated n times
    template<typename T, size_t N_MAX>
    bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
        const int kid = gguf_find_key(meta.get(), key.c_str());
        const int kid = gguf_find_key(metadata, key.c_str());

        if (kid < 0) {
            if (required) {
@ -432,9 +446,9 @@ namespace GGUFMeta {
            throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
        }

        if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
        if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
            struct GGUFMeta::ArrayInfo arr_info =
                GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
                GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);

            if (n != arr_info.length) {
                throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));

@ -465,7 +479,7 @@ namespace GGUFMeta {
    bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
        const std::string key = llm_kv(kid);

        const int id = gguf_find_key(meta.get(), key.c_str());
        const int id = gguf_find_key(metadata, key.c_str());

        if (id < 0) {
            if (required) {
@ -475,7 +489,7 @@ namespace GGUFMeta {
        }

        // throw and error if type is an array
        if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
        if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) {
            if (required) {
                throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
            }

@ -492,6 +506,9 @@ namespace GGUFMeta {


llama_model_loader::llama_model_loader(
        struct gguf_context * meta,
        llama_model_set_tensor_data_t set_tensor_data,
        void * set_tensor_data_ud,
        const std::string & fname,
        std::vector<std::string> & splits,
        bool use_mmap,
@ -499,7 +516,8 @@ llama_model_loader::llama_model_loader(
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
    : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
    int trace = 0;
    if (getenv("LLAMA_TRACE")) {
        trace = atoi(getenv("LLAMA_TRACE"));

@ -513,130 +531,142 @@ llama_model_loader::llama_model_loader(

    tensor_buft_overrides = param_tensor_buft_overrides_p;

    // Load the main GGUF
    struct ggml_context * ctx = NULL;
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ &ctx,
    };
    if (!fname.empty()) {
        // Load the main GGUF
        struct ggml_context * ctx = NULL;
        struct gguf_init_params params = {
            /*.no_alloc = */ true,
            /*.ctx      = */ &ctx,
        };

    meta.reset(gguf_init_from_file(fname.c_str(), params));
    if (!meta) {
        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
    }

    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
    llm_kv = LLM_KV(llm_arch_from_string(arch_name));

    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
    contexts.emplace_back(ctx);

    use_direct_io = use_direct_io && files.back()->has_direct_io();

    // Disable mmap in case Direct I/O is enabled and available
    if (use_direct_io && use_mmap) {
        use_mmap = false;
        LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
    }

    // Save tensors data offset of the main file.
    // For subsidiary files, `meta` tensor data offset must not be used,
    // so we build a unified tensors index for weights.
    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string tensor_name = std::string(cur->name);
        // make sure there is no duplicated tensor names
        if (weights_map.find(tensor_name) != weights_map.end()) {
            throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
        }
        n_elements += ggml_nelements(cur);
        n_bytes    += ggml_nbytes(cur);
        weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
    }
    uint16_t n_split = 0;
    get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

    // Load additional GGML contexts
    if (n_split > 1) {
        // make sure the main file is loaded first
        uint16_t idx = 0;
        const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
        get_key(kv_split_no, idx);
        if (idx != 0) {
            throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
        metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params));
        metadata = metadata_ptr.get();
        if (metadata == nullptr) {
            throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
        }

        // generate list of splits if needed
        if (splits.empty()) {
            splits = llama_get_list_splits(fname, idx, n_split);
        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
        llm_kv = LLM_KV(llm_arch_from_string(arch_name));

        files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
        contexts.emplace_back(ctx);

        if (use_mmap && use_direct_io) {
            if (files.back()->has_direct_io()) {
                LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
                use_mmap = false;
            } else {
                LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
                use_direct_io = false;

                // reopen file using std::fopen for mmap
                files.pop_back();
                files.emplace_back(new llama_file(fname.c_str(), "rb", false));
            }
        }

        // in case user give a custom list of splits, check if it matches the expected number
        if (n_split != (uint16_t)splits.size()) {
            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
        // Save tensors data offset of the main file.
        // For subsidiary files, `meta` tensor data offset must not be used,
        // so we build a unified tensors index for weights.
        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
            std::string tensor_name = std::string(cur->name);
            // make sure there is no duplicated tensor names
            if (weights_map.find(tensor_name) != weights_map.end()) {
                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
            }
            n_elements += ggml_nelements(cur);
            n_bytes    += ggml_nbytes(cur);
            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
        }
        uint16_t n_split = 0;
        get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

        if (trace > 0) {
            LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
        }

        // load other splits
        for (idx = 1; idx < n_split; idx++) {
            const char * fname_split = splits[idx].c_str();

            struct gguf_init_params split_params = {
                /*.no_alloc = */ true,
                /*.ctx      = */ &ctx,
            };
            gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
            if (!ctx_gguf) {
                throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
        // Load additional GGML contexts
        if (n_split > 1) {
            // make sure the main file is loaded first
            uint16_t idx = 0;
            const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
            get_key(kv_split_no, idx);
            if (idx != 0) {
                throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
            }

            // check idx
            // generate list of splits if needed
            if (splits.empty()) {
                splits = llama_get_list_splits(fname, idx, n_split);
            }

            // in case user give a custom list of splits, check if it matches the expected number
            if (n_split != (uint16_t)splits.size()) {
                throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
            }

            if (trace > 0) {
                LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
            }

            // load other splits
            for (idx = 1; idx < n_split; idx++) {
                const char * fname_split = splits[idx].c_str();

                struct gguf_init_params split_params = {
                    /*.no_alloc = */ true,
                    /*.ctx      = */ &ctx,
                };
                gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
                if (!ctx_gguf) {
                    throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
                }

                // check idx
                {
                    const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
                    if (kid < 0) {
                        throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
                    }
                    int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
                    if (idx_gguf != idx) {
                        throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
                    }
                }

                files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
                contexts.emplace_back(ctx);

                // Save tensors data offset info of the shard.
                for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
                    std::string tensor_name = std::string(cur->name);
                    // make sure there is no duplicated tensor names
                    if (weights_map.find(tensor_name) != weights_map.end()) {
                        throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
                    }
                    n_elements += ggml_nelements(cur);
                    n_bytes    += ggml_nbytes(cur);
                    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
                }
            }

            get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);

            // sanity check
            {
                const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
                if (kid < 0) {
                    throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
                }
                int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
                if (idx_gguf != idx) {
                    throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
                const int n_tensors_loaded = (int) weights_map.size();
                if (n_tensors != n_tensors_loaded) {
                    throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
                }
            }

            files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
            contexts.emplace_back(ctx);

            // Save tensors data offset info of the shard.
            for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
                std::string tensor_name = std::string(cur->name);
                // make sure there is no duplicated tensor names
                if (weights_map.find(tensor_name) != weights_map.end()) {
                    throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
                }
                n_elements += ggml_nelements(cur);
                n_bytes    += ggml_nbytes(cur);
                weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
            }
            LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
        }

        get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);

        // sanity check
        {
            const int n_tensors_loaded = (int) weights_map.size();
            if (n_tensors != n_tensors_loaded) {
                throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
            }
        }

        LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
    } else {
        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
    }

    n_kv      = gguf_get_n_kv(meta.get());
    n_kv      = gguf_get_n_kv(metadata);
    n_tensors = weights_map.size();

    fver = (enum llama_fver) gguf_get_version(meta.get());
    fver = (enum llama_fver) gguf_get_version(metadata);

    LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                   __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));

@ -695,6 +725,7 @@ llama_model_loader::llama_model_loader(
            case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
            case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
            case GGML_TYPE_IQ3_S:  ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;  break;
            case GGML_TYPE_NVFP4:  ftype = LLAMA_FTYPE_MOSTLY_NVFP4;  break;
            default:
                {
                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@ -715,14 +746,14 @@ llama_model_loader::llama_model_loader(
        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);

        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(meta.get(), i);
            const enum gguf_type type   = gguf_get_kv_type(meta.get(), i);
            const char * name           = gguf_get_key(metadata, i);
            const enum gguf_type type   = gguf_get_kv_type(metadata, i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i))
                : gguf_type_name(type);

            std::string value          = gguf_kv_to_str(meta.get(), i);
            std::string value          = gguf_kv_to_str(metadata, i);
            const size_t MAX_VALUE_LEN = 40;
            if (value.size() > MAX_VALUE_LEN) {
                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@ -824,15 +855,382 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
|
|||
return cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
|
||||
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
|
||||
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
|
||||
// checks if the weight tensor can be used with the specified buffer type and device
|
||||
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
|
||||
GGML_ASSERT(w != nullptr);
|
||||
|
||||
if (op == GGML_OP_NONE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ggml_tensor_overhead()*8,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
||||
if (!ctx_ptr) {
|
||||
throw std::runtime_error(format("failed to create ggml context"));
|
||||
}
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
|
||||
    ggml_tensor * op_tensor = nullptr;

    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                const int n_expert_used = hparams.n_expert_used;
                GGML_ASSERT(n_expert_used > 0);
                ggml_tensor * b   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_ADD_ID:
            {
                const int n_expert_used = hparams.n_expert_used;
                GGML_ASSERT(n_expert_used > 0);
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_add_id(ctx, a, w, c);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                const int n_embd_head = hparams.n_embd_head_v();
                const int n_head      = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );

            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs       = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state      = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head       = w->ne[1];
                const int64_t head_dim     = hparams.ssm_d_inner / n_head;
                const int64_t n_group      = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs       = 3;
                ggml_tensor * s   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S        = 123;
                const int64_t H        = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs   = 123;
                ggml_tensor * k     = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v     = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r     = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf    = w;
                ggml_tensor * td    = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd_inp = hparams.n_embd_inp();
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
            {
                op_tensor = ggml_scale(ctx, w, 1.0f);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}
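A note on the magic numbers above: the probe ops use a stand-in batch of 512 rows/tokens (and 3 sequences for the SSM ops) purely to give supports_op a realistically shaped dummy graph; judging from the surrounding code, the exact values appear to be arbitrary beyond being large enough to exercise each op.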
// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) {
    GGML_ASSERT(!buft_list->empty());
    for (const auto & cur : *buft_list) {
        ggml_backend_dev_t         cur_dev  = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }

    return nullptr;
}
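For orientation, a minimal caller-side sketch (an illustration only, not code from this commit; the choice of GGML_OP_MUL_MAT and the error handling are assumptions):

    // buft_list ordering encodes preference, so the first backend/buffer-type
    // pair whose device supports the weight's op wins
    ggml_backend_buffer_type_t buft = select_weight_buft(hparams, w, GGML_OP_MUL_MAT, buft_list);
    if (buft == nullptr) {
        throw std::runtime_error("no buffer type supports this weight"); // hypothetical fallback
    }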
struct ggml_tensor * llama_model_loader::create_tensor(
        const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
        const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // one ggml context per buffer type
            int max_n_tensors = n_tensors;
            max_n_tensors += 1;                 // duplicated output tensor
            max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
            if (files.empty()) {
                max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
            }
            const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;

            ggml_init_params params = {
                /*.mem_size   =*/ ctx_size,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                throw std::runtime_error(format("failed to create ggml context"));
            }

            ctx_map.emplace(buft, ctx);

            return ctx;
        }
        return it->second.get();
    };

    auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t {
        if (!t_meta) {
            if (flags & TENSOR_NOT_REQUIRED) {
                return nullptr;
            }
            throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
        }

        // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
        // the tensor is duplicated
        // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
        llm_tensor tn_tensor = tn.tensor;
        if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) {
            tn_tensor = LLM_TENSOR_OUTPUT;
        }

        llm_tensor_info info;
        try {
            info = llm_tensor_info_for(tn_tensor);
        } catch (const std::out_of_range & e) {
            throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
        }

        // skip unused tensors
        if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
            const size_t nbytes = ggml_nbytes(t_meta);
            LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);

            size_data -= nbytes;
            n_created++;

            return nullptr;
        }

        // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
        ggml_op op;
        bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
        if (bias) {
            if (info.op == GGML_OP_MUL_MAT_ID) {
                op = GGML_OP_ADD_ID;
            } else {
                op = GGML_OP_ADD;
            }
        } else {
            op = info.op;
        }

        // sanity checks
        if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
            if (tn.bid != -1) {
                GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
            }
        } else {
            if (tn.bid == -1) {
                GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
            }
        }

        // select the buffer type for this tensor
        const buft_list_t * buft_list;
        switch (info.layer) {
            case LLM_TENSOR_LAYER_INPUT:
                buft_list = buft_list_input;
                break;
            case LLM_TENSOR_LAYER_OUTPUT:
                buft_list = buft_list_output;
                break;
            case LLM_TENSOR_LAYER_REPEATING:
                GGML_ASSERT(buft_list_layer != nullptr);
                buft_list = buft_list_layer;
                break;
            default:
                GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
        }

        ggml_backend_buffer_type_t buft = nullptr;

        // check overrides
        if (tensor_buft_overrides) {
            std::string tensor_name = tn.str();
            for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                std::regex pattern(overrides->pattern);
                if (std::regex_search(tensor_name, pattern)) {
                    if (overrides->buft == ggml_backend_cpu_buffer_type()) {
                        // when overriding to a CPU buffer, consider the extra buffer types
                        buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
                    } else {
                        buft = overrides->buft;
                    }

                    LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                            tensor_name.c_str(),
                            ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
                            ggml_backend_buft_name(buft));
                    break;
                }
            }
        }

        if (!buft) {
            buft = select_weight_buft(hparams, t_meta, op, buft_list);
            if (!buft) {
                throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
            }
        }

        // avoid using a host buffer when using mmap
        auto * buft_dev = ggml_backend_buft_get_device(buft);
        if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
            auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
            if (!cpu_dev) {
                throw std::runtime_error("no CPU backend found");
            }
            buft = ggml_backend_dev_buffer_type(cpu_dev);
        }

        if (buft != buft_list->front().second) {
            if (n_tensors_moved == 0) {
                first_tensor_moved_name      = t_meta->name;
                first_tensor_moved_type_name = ggml_type_name(t_meta->type);
                first_moved_from_buft        = buft_list->front().second;
                first_moved_to_buft          = buft;
            }
            n_tensors_moved++;
        }

        return buft;
    };

    if (files.empty()) {
        if (flags & TENSOR_SKIP_IF_VIRTUAL) {
            return nullptr;
        }
        ggml_type type = GGML_TYPE_F32;
        const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
        if (tid != -1) {
            type = gguf_get_tensor_type(metadata, tid);
        }

        // for tensors that are not required some of the dimensions can be invalid:
        if (flags & TENSOR_NOT_REQUIRED) {
            for (size_t dim = 0; dim < ne.size(); dim++) {
                if (ne.begin()[dim] <= 0) {
                    return nullptr;
                }
            }
        }

        ggml_tensor t_meta;
        memset(&t_meta, 0, sizeof(ggml_tensor));
        t_meta.type = type;
        for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
            t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1;
            GGML_ASSERT(t_meta.ne[dim] >= 1);
            t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1];
            GGML_ASSERT(t_meta.nb[dim] >= 1);
        }
        ggml_set_name(&t_meta, tn.str().c_str());

        ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
        GGML_ASSERT(buft != nullptr);
        ggml_context * ctx = ctx_for_buft(buft);
        ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
        ggml_set_name(ret, tn.str().c_str());
        return ret;
    }
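The stride loop above follows the usual row-major ggml layout: nb[0] is the element size and each higher stride is the previous extent times the previous stride. As a worked example with illustrative numbers (not from the commit): an F32 tensor with ne = {4096, 512} gets nb[0] = 4, nb[1] = 4096*4 = 16384, and nb[2] = nb[3] = 512*16384 = 8388608, since the trailing dimensions are padded to 1.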
    ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str());
    ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta);
    if (buft == nullptr) {
        return nullptr; // return type is ggml_tensor *
    }
    ggml_context * ctx = ctx_for_buft(buft);

    // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
    if (flags & TENSOR_DUPLICATED) {
        ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
        if (t) {
            return t;
        }
    }

    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str());
    const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED));

    if (cur == NULL) {
        return NULL;
    }

    bool duplicated = flags & TENSOR_DUPLICATED;
    const bool duplicated = flags & TENSOR_DUPLICATED;

    struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
    ggml_set_name(tensor, ggml_get_name(cur));

@@ -844,7 +1242,6 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx
    }

    return tensor;
}

struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
@@ -879,6 +1276,11 @@ void llama_model_loader::done_getting_tensors() const {
    if (n_created != n_tensors) {
        throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
    }
    if (n_tensors_moved > 0) {
        LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
                __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
                ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
    }
}

void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {

@@ -960,6 +1362,12 @@ bool llama_model_loader::load_all_data(
        llama_mlocks * lmlocks,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    if (files.empty()) {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
            set_tensor_data(t, set_tensor_data_ud);
        }
        return true;
    }
    GGML_ASSERT(size_data != 0 && "call init_mappings() first");

    std::vector<no_init<uint8_t>> read_buf;
@@ -4,17 +4,22 @@

#include "llama-impl.h"
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-mmap.h"

#include "ggml-cpp.h"

#include <cstddef>
#include <cstring>
#include <map>
#include <stdexcept>
#include <unordered_map>

using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;

enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,

@@ -58,9 +63,10 @@ struct llama_model_loader {
        }
    };

    static const int TENSOR_NOT_REQUIRED = 1 << 0;
    static const int TENSOR_DUPLICATED   = 1 << 1;
    static const int TENSOR_SKIP         = 1 << 2;
    static const int TENSOR_NOT_REQUIRED    = 1 << 0;
    static const int TENSOR_DUPLICATED      = 1 << 1;
    static const int TENSOR_SKIP            = 1 << 2;
    static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;

    int n_kv      = 0;
    int n_tensors = 0;

@@ -84,7 +90,10 @@ struct llama_model_loader {
    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
    const llama_model_tensor_buft_override * tensor_buft_overrides;

    gguf_context_ptr meta;
    gguf_context_ptr metadata_ptr;
    struct gguf_context * metadata; // either metadata_ptr.get() or externally set
    llama_model_set_tensor_data_t set_tensor_data;
    void * set_tensor_data_ud;
    std::vector<ggml_context_ptr> contexts;

    std::string arch_name;

@@ -94,7 +103,26 @@ struct llama_model_loader {
    size_t size_data = 0;
    std::vector<std::pair<size_t, size_t>> mmaps_used;

    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
    struct ggml_backend_buft_comparator {
        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
        }
    };

    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;

    // track tensors that had to be moved for debugging:
    size_t n_tensors_moved = 0;
    std::string first_tensor_moved_name;
    std::string first_tensor_moved_type_name;
    ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
    ggml_backend_buffer_type_t first_moved_to_buft   = nullptr;

    llama_model_loader(
            struct gguf_context * metadata,
            llama_model_set_tensor_data_t set_tensor_data,
            void * set_tensor_data_ud,
            const std::string & fname,
            std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
            bool use_mmap,

@@ -149,7 +177,9 @@ struct llama_model_loader {

    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;

    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
    struct ggml_tensor * create_tensor(
            const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
            const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);

    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
@@ -7,14 +7,19 @@
#include "llama-model.h"
#include "llama-vocab.h"

#include <cstdint>
#include <string>

llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
    gguf_ctx = gguf_init_empty();
}
llama_model_saver::llama_model_saver(const struct llama_model * model) :
        gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {}

llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
        gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}

llama_model_saver::~llama_model_saver() {
    gguf_free(gguf_ctx);
    if (gguf_ctx_owned) {
        gguf_free(gguf_ctx);
    }
}

void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {

@@ -46,7 +51,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) {

template <typename Container>
void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
    const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
    GGML_ASSERT(model != nullptr || !per_layer);
    const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
    GGML_ASSERT(n_values <= value.size());

    if (n_values == 0) {

@@ -83,6 +89,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, c
        GGML_ABORT("fatal error");
    }
}
// instantiate for external usage:
template void llama_model_saver::add_kv<std::vector<uint32_t>>(const enum llm_kv, const std::vector<uint32_t> &, const bool);

void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
    std::vector<const char *> tmp(value.size());
@@ -104,37 +112,39 @@ void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
}

void llama_model_saver::add_kv_from_model() {
    const llama_hparams & hparams = model.hparams;
    const llama_vocab   & vocab   = model.vocab;
    const llama_hparams & hparams = model->hparams;
    const llama_vocab   & vocab   = model->vocab;

    const int32_t n_vocab = vocab.n_tokens();
    std::vector<std::string> tokens(n_vocab);
    std::vector<float>       scores(n_vocab);
    std::vector<int32_t>     token_types(n_vocab);

    for (int32_t id = 0; id < n_vocab; ++id) {
        const llama_vocab::token_data & token_data = vocab.get_token_data(id);
    if (vocab.get_type() != LLAMA_VOCAB_TYPE_NONE) {
        for (int32_t id = 0; id < n_vocab; ++id) {
            const llama_vocab::token_data & token_data = vocab.get_token_data(id);

        tokens[id] = token_data.text;
        scores[id] = token_data.score;
            tokens[id] = token_data.text;
            scores[id] = token_data.score;

        switch(token_data.attr) {
            case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
            case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
            case LLAMA_TOKEN_ATTR_NORMAL:       token_types[id] = LLAMA_TOKEN_TYPE_NORMAL;       break;
            case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
            case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
            case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
            case LLAMA_TOKEN_ATTR_UNDEFINED:
            default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
            switch(token_data.attr) {
                case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
                case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
                case LLAMA_TOKEN_ATTR_NORMAL:       token_types[id] = LLAMA_TOKEN_TYPE_NORMAL;       break;
                case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
                case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
                case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
                case LLAMA_TOKEN_ATTR_UNDEFINED:
                default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
            }
        }
    }

    // add_kv(LLM_KV_GENERAL_TYPE, ???);
    add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
    add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
    // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
    // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
    add_kv(LLM_KV_GENERAL_NAME, model.name);
    add_kv(LLM_KV_GENERAL_NAME, model->name);
    // add_kv(LLM_KV_GENERAL_AUTHOR, ???);
    // add_kv(LLM_KV_GENERAL_VERSION, ???);
    // add_kv(LLM_KV_GENERAL_URL, ???);

@@ -146,8 +156,8 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
    add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    if (hparams.n_embd_out > 0) {
        add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
    if (hparams.n_embd_out_impl > 0) {
        add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl);
    }
    add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);

@@ -176,8 +186,10 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
    add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
    add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
    add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
    add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
    add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
    add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);

@@ -189,7 +201,8 @@ void llama_model_saver::add_kv_from_model() {

    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;

    add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
    add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
    add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
    add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
    // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
    add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));

@@ -255,24 +268,25 @@ void llama_model_saver::add_kv_from_model() {
}

void llama_model_saver::add_tensors_from_model() {
    if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
        add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
    if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
        add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
    }
    add_tensor(model.type_embd);
    add_tensor(model.pos_embd);
    add_tensor(model.tok_norm);
    add_tensor(model.tok_norm_b);
    add_tensor(model.output_norm);
    add_tensor(model.output_norm_b);
    add_tensor(model.output);
    add_tensor(model.output_b);
    add_tensor(model.output_norm_enc);
    add_tensor(model.cls);
    add_tensor(model.cls_b);
    add_tensor(model.cls_out);
    add_tensor(model.cls_out_b);
    add_tensor(model->type_embd);
    add_tensor(model->pos_embd);
    add_tensor(model->tok_norm);
    add_tensor(model->tok_norm_b);
    add_tensor(model->output_norm);
    add_tensor(model->output_norm_b);
    add_tensor(model->output);
    add_tensor(model->output_b);
    add_tensor(model->output_norm_enc);
    add_tensor(model->cls);
    add_tensor(model->cls_b);
    add_tensor(model->cls_out);
    add_tensor(model->cls_out_b);
    add_tensor(model->cls_norm);

    for (const struct llama_layer & layer : model.layers) {
    for (const struct llama_layer & layer : model->layers) {
        for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
            add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
        }
@@ -1,5 +1,6 @@
#pragma once

#include "gguf.h"
#include "llama.h"
#include "llama-arch.h"


@@ -7,10 +8,12 @@

struct llama_model_saver {
    struct gguf_context * gguf_ctx = nullptr;
    const struct llama_model & model;
    const bool gguf_ctx_owned;
    const struct llama_model * model;
    const struct LLM_KV llm_kv;

    llama_model_saver(const struct llama_model & model);
    llama_model_saver(const struct llama_model * model);
    llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx);
    ~llama_model_saver();

    void add_kv(enum llm_kv key, uint32_t value);
File diff suppressed because it is too large
@@ -11,6 +11,7 @@
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct llama_cparams;

@@ -53,6 +54,7 @@ enum llm_type {
    LLM_TYPE_0_3B,
    LLM_TYPE_0_5B,
    LLM_TYPE_0_6B,
    LLM_TYPE_0_8B,
    LLM_TYPE_1B,
    LLM_TYPE_1_2B,
    LLM_TYPE_1_3B,

@@ -115,17 +117,25 @@ enum llm_type {
    LLM_TYPE_8B_A1B,    // lfm2moe
    LLM_TYPE_16B_A1B,
    LLM_TYPE_21B_A3B,   // Ernie MoE small
    LLM_TYPE_24B_A2B,   // lfm2moe
    LLM_TYPE_30B_A3B,
    LLM_TYPE_31B_A3_5B,
    LLM_TYPE_35B_A3B,   // Qwen3.5
    LLM_TYPE_48B_A3B,   // Kimi Linear
    LLM_TYPE_80B_A3B,   // Qwen3 Next
    LLM_TYPE_100B_A6B,
    LLM_TYPE_102B_A12B, // Solar-Open
    LLM_TYPE_106B_A12B, // GLM-4.5-Air
    LLM_TYPE_120B_A12B, // Nemotron 3 Super
    LLM_TYPE_122B_A10B, // Qwen3.5
    LLM_TYPE_196B_A11B, // Step3.5-Flash
    LLM_TYPE_230B_A10B, // Minimax M2
    LLM_TYPE_235B_A22B,
    LLM_TYPE_300B_A47B, // Ernie MoE big
    LLM_TYPE_310B_A15B, // MiMo-V2-Flash
    LLM_TYPE_355B_A32B, // GLM-4.5
    LLM_TYPE_397B_A17B, // Qwen3.5
    LLM_TYPE_744B_A40B, // GLM-5
    LLM_TYPE_E2B,
    LLM_TYPE_E4B,
};
@@ -274,14 +284,25 @@ struct llama_layer {
    struct ggml_tensor * ffn_up_enc = nullptr;

    // ff MoE
    struct ggml_tensor * ffn_gate_inp    = nullptr;
    struct ggml_tensor * ffn_gate_exps   = nullptr;
    struct ggml_tensor * ffn_down_exps   = nullptr;
    struct ggml_tensor * ffn_up_exps     = nullptr;
    struct ggml_tensor * ffn_gate_inp_b  = nullptr;
    struct ggml_tensor * ffn_gate_exps_b = nullptr;
    struct ggml_tensor * ffn_down_exps_b = nullptr;
    struct ggml_tensor * ffn_up_exps_b   = nullptr;
    struct ggml_tensor * ffn_gate_inp       = nullptr;
    struct ggml_tensor * ffn_gate_exps      = nullptr;
    struct ggml_tensor * ffn_down_exps      = nullptr;
    struct ggml_tensor * ffn_up_exps        = nullptr;
    struct ggml_tensor * ffn_gate_up_exps   = nullptr;
    struct ggml_tensor * ffn_gate_inp_b     = nullptr;
    struct ggml_tensor * ffn_gate_exps_b    = nullptr;
    struct ggml_tensor * ffn_down_exps_b    = nullptr;
    struct ggml_tensor * ffn_up_exps_b      = nullptr;
    struct ggml_tensor * ffn_gate_up_exps_b = nullptr;

    // ff MoE per-expert scales (NVFP4 per-tensor scale2)
    struct ggml_tensor * ffn_gate_exps_s = nullptr;
    struct ggml_tensor * ffn_down_exps_s = nullptr;
    struct ggml_tensor * ffn_up_exps_s   = nullptr;

    // ff MoE latent proj
    struct ggml_tensor * ffn_latent_down = nullptr;
    struct ggml_tensor * ffn_latent_up   = nullptr;

    // ff shared expert (shexp)
    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;

@@ -319,6 +340,9 @@ struct llama_layer {
    // qwen3next
    struct ggml_tensor * ssm_beta_alpha = nullptr;

    // qwen3.5
    struct ggml_tensor * ssm_alpha = nullptr;

    // rwkv
    struct ggml_tensor * time_mix_w1 = nullptr;
    struct ggml_tensor * time_mix_w2 = nullptr;

@@ -373,13 +397,21 @@ struct llama_layer {
    struct ggml_tensor * rope_freqs = nullptr;

    // bitnet scale
    struct ggml_tensor * wq_scale       = nullptr;
    struct ggml_tensor * wk_scale       = nullptr;
    struct ggml_tensor * wv_scale       = nullptr;
    struct ggml_tensor * wo_scale       = nullptr;
    struct ggml_tensor * ffn_gate_scale = nullptr;
    struct ggml_tensor * ffn_up_scale   = nullptr;
    struct ggml_tensor * ffn_down_scale = nullptr;
    struct ggml_tensor * wq_s             = nullptr;
    struct ggml_tensor * wk_s             = nullptr;
    struct ggml_tensor * wv_s             = nullptr;
    struct ggml_tensor * wo_s             = nullptr;
    struct ggml_tensor * wqkv_s           = nullptr;
    struct ggml_tensor * wqkv_gate_s      = nullptr;
    struct ggml_tensor * ffn_gate_s       = nullptr;
    struct ggml_tensor * ffn_up_s         = nullptr;
    struct ggml_tensor * ffn_down_s       = nullptr;
    struct ggml_tensor * ffn_gate_shexp_s = nullptr;
    struct ggml_tensor * ffn_up_shexp_s   = nullptr;
    struct ggml_tensor * ffn_down_shexp_s = nullptr;
    struct ggml_tensor * ssm_out_s        = nullptr;
    struct ggml_tensor * ssm_alpha_s      = nullptr;
    struct ggml_tensor * ssm_beta_s       = nullptr;

    // altup & laurel
    struct ggml_tensor * per_layer_inp_gate = nullptr;
@@ -410,6 +442,25 @@ struct llama_layer {
    struct ggml_tensor * ffn_act_beta = nullptr;
    struct ggml_tensor * ffn_act_eps  = nullptr;

    // Kimi Linear KDA (using ssm_ prefix for consistency)
    // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias
    struct ggml_tensor * ssm_q_conv = nullptr;
    struct ggml_tensor * ssm_k_conv = nullptr;
    struct ggml_tensor * ssm_v_conv = nullptr;
    struct ggml_tensor * ssm_f_a    = nullptr;
    struct ggml_tensor * ssm_f_b    = nullptr;
    struct ggml_tensor * ssm_beta   = nullptr;
    struct ggml_tensor * ssm_g_a    = nullptr;
    struct ggml_tensor * ssm_g_b    = nullptr;
    struct ggml_tensor * ssm_o_norm = nullptr;

    // DSA (deepseek sparse attention)
    struct ggml_tensor * indexer_k_norm   = nullptr;
    struct ggml_tensor * indexer_k_norm_b = nullptr;
    struct ggml_tensor * indexer_proj     = nullptr;
    struct ggml_tensor * indexer_attn_k   = nullptr;
    struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias

    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;

@@ -448,6 +499,7 @@ struct llama_model {
    struct ggml_tensor * cls_b     = nullptr;
    struct ggml_tensor * cls_out   = nullptr;
    struct ggml_tensor * cls_out_b = nullptr;
    struct ggml_tensor * cls_norm  = nullptr;

    struct ggml_tensor * conv1d   = nullptr;
    struct ggml_tensor * conv1d_b = nullptr;

@@ -464,8 +516,9 @@ struct llama_model {
    // Dense linear projections for SentenceTransformers models like embeddinggemma
    // For Sentence Transformers models structure see
    // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
    struct ggml_tensor * dense_2_out_layers = nullptr;
    struct ggml_tensor * dense_3_out_layers = nullptr;
    struct ggml_tensor * dense_2_out_layers   = nullptr;
    struct ggml_tensor * dense_2_out_layers_b = nullptr;
    struct ggml_tensor * dense_3_out_layers   = nullptr;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

@@ -476,8 +529,8 @@ struct llama_model {
    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    // for keeping track of extra nodes used by lora adapters
    uint32_t n_lora_nodes = 0;
    // for keeping track of associated LoRA adapters
    std::unordered_set<llama_adapter_lora *> loras;

    int64_t t_load_us  = 0;
    int64_t t_start_us = 0;
File diff suppressed because it is too large
@@ -1,4 +1,4 @@
#include "llama-sampling.h"
#include "llama-sampler.h"

#include "llama-impl.h"
#include "llama-vocab.h"

@@ -1025,11 +1025,7 @@ struct llama_sampler_dist : public llama_sampler_backend {

    std::mt19937 rng;

    // backend input
    struct ggml_tensor * inp_uniform;

    ggml_context_ptr inp_ctx;
    ggml_backend_buffer_ptr inp_buf;
    ggml_tensor * inp_uniform;
};

static const char * llama_sampler_dist_name(const struct llama_sampler * smpl) {

@@ -1138,37 +1134,10 @@ static bool llama_sampler_dist_backend_init(
        ggml_backend_buffer_type_t buft) {
    auto * sctx = (llama_sampler_dist *) smpl->ctx;

    // allocate inputs
    {
        ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead(),
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };

        sctx->inp_ctx.reset(ggml_init(params));

        // Create the uniform random scalar input tensor. This will be set by
        // llama_sampler_dist_backend_set_input after this graph is built.
        sctx->inp_uniform = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1);
        ggml_set_name (sctx->inp_uniform, "uniform");
        ggml_set_input(sctx->inp_uniform);

        // Allocate all tensors from our context to the backend
        sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft));

        ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);
    }

    const bool res = llama_sampler_backend_support(smpl, buft);

    sctx->init(res);

    if (!res) {
        sctx->inp_ctx.reset(nullptr);
        sctx->inp_buf.reset(nullptr);
    }

    return res;
}


@@ -1178,8 +1147,13 @@ static void llama_sampler_dist_backend_apply(
        struct ggml_cgraph * gf,
        struct llama_sampler_data * data) {
    GGML_UNUSED(gf);

    auto * sctx = (llama_sampler_dist *) smpl->ctx;

    sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    ggml_set_name (sctx->inp_uniform, "uniform");
    ggml_set_input(sctx->inp_uniform);

    struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
    ggml_set_name(probs, "dist_probs");

@@ -1226,6 +1200,7 @@ static void llama_sampler_dist_backend_apply(

static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) {
    auto * sctx = (llama_sampler_dist *) smpl->ctx;

    GGML_ASSERT(sctx->inp_uniform != nullptr);

    // We sample in double precision and cast to float to match rnd numbers of

@@ -1262,8 +1237,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
            /* .seed_cur    = */ seed_cur,
            /* .rng         = */ std::mt19937(seed_cur),
            /* .inp_uniform = */ nullptr,
            /* .inp_ctx     = */ nullptr,
            /* .inp_buf     = */ nullptr,
        }
    );
}
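Reading the dist-sampler hunks above together with the logit-bias hunks further down, both appear to make the same design move: sampler input tensors are no longer allocated once into a persistent per-sampler context and buffer at backend_init time, but are created inside the sampling graph on every backend_apply, so the graph allocator owns their memory. That is an editorial reading of the hunks, not wording from the commit itself.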
@@ -1513,12 +1486,9 @@ static void llama_sampler_top_p_backend_apply(
    mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
    mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);

    // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
    // top_p_bias = (mask * 1e9f) - 1e9f.
    // So entries in the mask that we want to discard will become -1e9f, and
    // others will be 0 (meaning that they will not affect the logits).
    const float large_val = 1e9f;
    struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
    // Apply -INFINITY bias for masked-out tokens
    // log(1) = 0 (keep), log(0) = -INF (discard)
    struct ggml_tensor * top_p_bias = ggml_log(ctx, mask);
    ggml_set_name(top_p_bias, "top_p_bias");

    data->logits = ggml_add(ctx, sorted_logits, top_p_bias);

@@ -1673,15 +1643,11 @@ static void llama_sampler_min_p_backend_apply(
    struct ggml_tensor * mask = ggml_step(ctx, sub);
    ggml_set_name(mask, "min_p_mask");

    // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
    // min_p_bias = (mask * 1e9f) - 1e9f.
    // So entries in the mask that we want to discard will become -1e9f, and
    // others will be 0 (meaning that they will not affect the logits).
    const float large_val = 1e9f;
    struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
    // Apply -INFINITY bias for masked-out tokens
    // log(1) = 0 (keep), log(0) = -INF (discard)
    struct ggml_tensor * min_p_bias = ggml_log(ctx, mask);
    ggml_set_name(min_p_bias, "min_p_bias");

    // Add the min_p bias to the logits.
    data->logits = ggml_add(ctx, data->logits, min_p_bias);
    ggml_set_name(data->logits, "min_p_logits");
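A quick numeric check of the log trick used in both hunks above: for a keep/discard mask of {1, 0, 1}, the element-wise log yields {0, -INF, 0}, so adding it as a bias leaves kept logits unchanged and drives discarded ones to -INF, which the subsequent softmax maps to probability exactly 0. The replaced 1e9 variant only approximated this with a large finite penalty.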
@@ -3293,6 +3259,170 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
    return result;
}

// adaptive-p sampler state
//
// maintains an exponential moving average of the *ORIGINAL* probabilities
// of selected tokens, used to compute an adapted target at each sampling step.
//
// see llama.h for a full description of the sampler
//
// ref: https://github.com/ggml-org/llama.cpp/pull/17927
//
struct llama_sampler_adaptive_p {
    const float    target;   // target probability (0.0 - 1.0; negative = disabled)
    const float    decay;    // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99)
    const uint32_t seed;     // original RNG seed
          uint32_t seed_cur; // actual RNG seed
    std::mt19937   rng;      // RNG state
    float weighted_sum;      // sum(p_i * decay^i)
    float total_weight;      // sum(decay^i), converges to 1/(1-decay)
    std::vector<float> original_probs; // pre-transform probs, cached for EMA update
    llama_token pending_token_id;      // token ID of selected token
    int32_t     pending_token_idx;     // index of orig. prob. of selected token in original_probs
};

// adaptive probability transformation constants
static constexpr float DISTRIBUTION_WIDTH = 0.3f;
static constexpr float PEAK_LOGIT_VALUE   = 5.0f;
static constexpr float SHARPNESS          = 10.0f;
static constexpr float INV_WIDTH          = 1.0f / DISTRIBUTION_WIDTH;

static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
    return "adaptive-p";
}

static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;

    llama_sampler_softmax_impl(cur_p, false);

    if (ctx->target < 0.0f) {
        // at negative target values, adaptive-p is a no-op:
        // we simply sample from the existing distribution
        cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
        return;
    }

    // store the original probabilities
    ctx->original_probs.resize(cur_p->size);
    for (size_t i = 0; i < cur_p->size; ++i) {
        ctx->original_probs[i] = cur_p->data[i].p;
    }

    // using the EMA, compute the adapted target probability for the current sampling step
    auto target = std::clamp(ctx->target, 0.0f, 1.0f);
    float adapted_target = std::clamp(
        ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
        0.0f, 1.0f
    );

    // adaptive probability transform
    //
    // quadratic near the target for fine differentiation, transitioning to linear decay in the
    // tails. unbounded negative logits ensure proper suppression of far-from-target tokens
    // after the softmax.
    //
    for (size_t i = 0; i < cur_p->size; ++i) {
        if (cur_p->data[i].logit == -INFINITY) {
            // don't transform logits that are -INFINITY
            // (as masked out by e.g. min-p and top-p when using backend sampling)
            continue;
        }
        float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
        cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
    }

    // softmax and sample from the transformed distribution
    llama_sampler_softmax_impl(cur_p, false);
    const int idx = llama_sample_dist(cur_p, ctx->rng);
    cur_p->selected = idx;

    // store the selected token ID for acceptance later
    ctx->pending_token_id  = cur_p->data[idx].id;
    ctx->pending_token_idx = idx;
}
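To make the arithmetic concrete, a worked example with illustrative numbers (not from the commit): with target = 0.5 and decay = 0.9, the EMA history spans about 1/(1 - 0.9) = 10 tokens. If recently selected tokens had original probabilities averaging 0.6, then weighted_sum/total_weight ~= 0.6 and adapted_target = clamp(2*0.5 - 0.6, 0, 1) = 0.4, so the next step leans toward slightly less likely tokens to pull the running average back to 0.5. In the transform itself, a token whose probability equals the adapted target has dist = 0 and receives the peak logit 5.0; a token 0.3 away has dist = 0.3/0.3 = 1 and receives 5 - 10*1/(1+1) = 0; and for dist >> 1 the logit falls off nearly linearly with slope -SHARPNESS.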
static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) {
    auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
    if (ctx->pending_token_id == token) {
        GGML_ASSERT(ctx->pending_token_id  != LLAMA_TOKEN_NULL);
        GGML_ASSERT(ctx->pending_token_idx != -1);
        // update EMA with the original probability of the selected token
        ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum;
        ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
    }
    ctx->pending_token_id  = LLAMA_TOKEN_NULL;
    ctx->pending_token_idx = -1;
}

static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
    // ctx->target and ctx->decay never change after init, so it's safe to keep them as is.
    // original_probs is completely overwritten on every call to _apply,
    // so we only need to reset the EMA state and the pending token.
    ctx->weighted_sum = ctx->target / (1.0f - ctx->decay);
    ctx->total_weight = 1.0f / (1.0f - ctx->decay);
    ctx->pending_token_id  = LLAMA_TOKEN_NULL;
    ctx->pending_token_idx = -1;
    ctx->seed_cur = get_rng_seed(ctx->seed);
    ctx->rng.seed(ctx->seed_cur);
}
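The reset values follow from the geometric series sum over decay^i, which converges to 1/(1 - decay): seeding total_weight at that limit and weighted_sum at target/(1 - decay) makes the initial EMA weighted_sum/total_weight exactly target, so the first adapted_target evaluates to 2*target - target = target and the sampler starts as if its history already averaged the target probability.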
static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
    auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
    auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;

    // copy everything (target, decay, seed, and RNG are already set)
    result_ctx->weighted_sum      = ctx->weighted_sum;
    result_ctx->total_weight      = ctx->total_weight;
    result_ctx->pending_token_id  = ctx->pending_token_id;
    result_ctx->pending_token_idx = ctx->pending_token_idx;

    return result;
}

static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
    delete (llama_sampler_adaptive_p *) smpl->ctx;
}

static struct llama_sampler_i llama_sampler_adaptive_p_i = {
    /* .name              = */ llama_sampler_adaptive_p_name,
    /* .accept            = */ llama_sampler_adaptive_p_accept,
    /* .apply             = */ llama_sampler_adaptive_p_apply,
    /* .reset             = */ llama_sampler_adaptive_p_reset,
    /* .clone             = */ llama_sampler_adaptive_p_clone,
    /* .free              = */ llama_sampler_adaptive_p_free,
    /* .backend_init      = */ nullptr,
    /* .backend_accept    = */ nullptr,
    /* .backend_apply     = */ nullptr,
    /* .backend_set_input = */ nullptr,
};

struct llama_sampler * llama_sampler_init_adaptive_p(
        float    target,
        float    decay,
        uint32_t seed
) {
    auto seed_cur = get_rng_seed(seed);
    float clamped_decay = std::clamp(decay, 0.0f, 0.99f);
    return llama_sampler_init(
        /* .iface = */ &llama_sampler_adaptive_p_i,
        /* .ctx   = */ new llama_sampler_adaptive_p {
            /* .target            = */ target,
            /* .decay             = */ clamped_decay,
            /* .seed              = */ seed,
            /* .seed_cur          = */ seed_cur,
            /* .rng               = */ std::mt19937(seed_cur),
            /* .weighted_sum      = */ target / (1.0f - clamped_decay),
            /* .total_weight      = */ 1.0f / (1.0f - clamped_decay),
            /* .original_probs    = */ {},
            /* .pending_token_id  = */ LLAMA_TOKEN_NULL,
            /* .pending_token_idx = */ -1
        }
    );
}
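For context, a minimal usage sketch. It assumes the existing sampler-chain API from llama.h, and the parameter values are arbitrary examples, not recommendations:

    // chain a filter with the new adaptive-p sampler, then sample per token
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, 1));
    llama_sampler_chain_add(chain, llama_sampler_init_adaptive_p(/*target*/ 0.35f, /*decay*/ 0.90f, /*seed*/ LLAMA_DEFAULT_SEED));
    // llama_token tok = llama_sampler_sample(chain, lctx, -1); // per decoded token
    llama_sampler_free(chain);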
// logit-bias

struct llama_sampler_logit_bias : public llama_sampler_backend {

@@ -3304,9 +3434,6 @@ struct llama_sampler_logit_bias : public llama_sampler_backend {

    struct ggml_tensor * inp_logit_bias;
    struct ggml_tensor * inp_logit_idxs;

    ggml_context_ptr inp_ctx;
    ggml_backend_buffer_ptr inp_buf;
};

static const char * llama_sampler_logit_bias_name(const struct llama_sampler * smpl) {

@@ -3369,6 +3496,16 @@ static void llama_sampler_logit_bias_backend_apply(
        return;
    }

    const size_t n = sctx->logit_bias.size();

    sctx->inp_logit_bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n);
    ggml_set_name(sctx->inp_logit_bias, "logit_bias");
    ggml_set_input(sctx->inp_logit_bias);

    sctx->inp_logit_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n);
    ggml_set_name(sctx->inp_logit_idxs, "logit_idxs");
    ggml_set_input(sctx->inp_logit_idxs);

    ggml_tensor * cur = ggml_fill(ctx, data->logits, 0.0f);

    cur = ggml_reshape_2d(ctx, cur, 1, ggml_nelements(cur));

@@ -3405,6 +3542,8 @@ static void llama_sampler_logit_bias_backend_set_input(struct llama_sampler * sm
static bool llama_sampler_logit_bias_backend_init(
        struct llama_sampler * smpl,
        ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;

    sctx->init(true);

@@ -3413,29 +3552,6 @@ static bool llama_sampler_logit_bias_backend_init(
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };

    sctx->inp_ctx.reset(ggml_init(params));

    const size_t n = sctx->logit_bias.size();

    sctx->inp_logit_bias = ggml_new_tensor_2d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1, n);
    ggml_set_name(sctx->inp_logit_bias, "logit_bias");
    ggml_set_input(sctx->inp_logit_bias);

    sctx->inp_logit_idxs = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_I32, n);
    ggml_set_name(sctx->inp_logit_idxs, "logit_idxs");
    ggml_set_input(sctx->inp_logit_idxs);

    // Allocate all tensors from our context to the backend
    sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft));

    ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);

    return true;
}


@@ -3471,8 +3587,6 @@ struct llama_sampler * llama_sampler_init_logit_bias(
            /* .to_search      = */ {},
            /* .inp_logit_bias = */ nullptr,
            /* .inp_logit_idxs = */ nullptr,
            /* .inp_ctx        = */ nullptr,
            /* .inp_buf        = */ nullptr,
        }
    );
}
@@ -1,7 +1,5 @@
#pragma once

// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?

#include "llama.h"

#include <vector>

@@ -90,7 +90,7 @@ static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not
//
// SPM tokenizer
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
// https://github.com/ggml-org/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
//

struct llm_bigram_spm {
@@ -285,10 +285,19 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                // original regex from tokenizer.json
                //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
                // adapted: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2080233989
                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
            };
            break;
        case LLAMA_VOCAB_PRE_TYPE_JAIS2:
            regex_exprs = {
                // original regex from tokenizer.json
                //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s{512}(?!\\S)|\\s{256}(?!\\S)|\\s{128}(?!\\S)|\\s{64}(?!\\S)|\\s{32}(?!\\S)|\\s{16}(?!\\S)|\\s{8}(?!\\S)|\\s{4}(?!\\S)|\\s{1,2}(?!\\S)|\\s{1}",

                // adapted: same as llama3 but with cascading whitespace pattern
                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s{512}(?!\\S)|\\s{256}(?!\\S)|\\s{128}(?!\\S)|\\s{64}(?!\\S)|\\s{32}(?!\\S)|\\s{16}(?!\\S)|\\s{8}(?!\\S)|\\s{4}(?!\\S)|\\s{1,2}(?!\\S)|\\s{1}",
            };
            break;
        case LLAMA_VOCAB_PRE_TYPE_DBRX:
        case LLAMA_VOCAB_PRE_TYPE_SMAUG:
            regex_exprs = {

@@ -308,6 +317,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
            break;
        case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
        case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
        case LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM:
            regex_exprs = {
                "\\p{N}{1,3}",
                "[一-龥-ゟ゠-ヿ]+",

@@ -368,6 +378,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
            };
            break;
        case LLAMA_VOCAB_PRE_TYPE_QWEN35:
            regex_exprs = {
                // original regex from tokenizer.json
                // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
            };
            break;
        case LLAMA_VOCAB_PRE_TYPE_PORO:
        case LLAMA_VOCAB_PRE_TYPE_BLOOM:
        case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:

@@ -415,6 +432,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
            };
            break;
        case LLAMA_VOCAB_PRE_TYPE_TINY_AYA:
            regex_exprs = {
                // original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)"
                "\\d{1,3}(?=(?:\\d{3})*\\b)",
                // original regex from tokenizer.json: "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
            };
            break;
        case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
            regex_exprs = {
                // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp

@@ -461,6 +486,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
            };
            break;
        case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
            regex_exprs = {
                // original regex from tokenizer.json
                // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
            };
            break;
        default:
            // default regex for BPE tokenization pre-processing
            regex_exprs = {
@@ -1687,7 +1719,7 @@ private:
};

void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
    struct gguf_context * ctx = ml.meta.get();
    struct gguf_context * ctx = ml.metadata;

    // determine vocab type
    {

@@ -1745,26 +1777,33 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

        // read bpe merges and populate bpe ranks
        const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
        // Kimi-K2 uses custom tokenization without traditional BPE merges
        const bool is_kimi_k2 = (tokenizer_pre == "kimi-k2");

        if (merges_keyidx == -1) {
            throw std::runtime_error("cannot find tokenizer merges in model file\n");
        }

        const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
        for (int i = 0; i < n_merges; i++) {
            const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
            //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

            std::string first;
            std::string second;

            const size_t pos = word.find(' ', 1);

            if (pos != std::string::npos) {
                first  = word.substr(0, pos);
                second = word.substr(pos + 1);
            if (!is_kimi_k2) {
                throw std::runtime_error("cannot find tokenizer merges in model file\n");
            }
            // Kimi-K2 doesn't need merges, skip
            LLAMA_LOG_INFO("%s: Kimi-K2 tokenizer detected, skipping BPE merges\n", __func__);
        } else {
            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
            for (int i = 0; i < n_merges; i++) {
                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
                //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

            bpe_ranks.emplace(std::make_pair(first, second), i);
                std::string first;
                std::string second;

                const size_t pos = word.find(' ', 1);

                if (pos != std::string::npos) {
                    first  = word.substr(0, pos);
                    second = word.substr(pos + 1);
                }

                bpe_ranks.emplace(std::make_pair(first, second), i);
            }
        }

        // default special tokens

@@ -1794,7 +1833,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
        precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        // correct endiannes of data in precompiled_charsmap binary blob
        // correct endianness of data in precompiled_charsmap binary blob
        uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
        *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
        assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
@ -1851,7 +1890,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
tokenizer_pre == "falcon-h1" ||
|
||||
tokenizer_pre == "pixtral" ||
|
||||
tokenizer_pre == "midm-2.0" ||
|
||||
tokenizer_pre == "lfm2") {
|
||||
tokenizer_pre == "lfm2" ||
|
||||
tokenizer_pre == "jina-v5-nano") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
||||
ignore_merges = true;
|
||||
add_bos = true;
|
||||
|
|
@ -1891,8 +1931,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
tokenizer_pre == "jina-v2-de" ||
|
||||
tokenizer_pre == "a.x-4.0" ||
|
||||
tokenizer_pre == "mellum" ||
|
||||
tokenizer_pre == "modern-bert" ) {
|
||||
tokenizer_pre == "modern-bert") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||
} else if (
|
||||
tokenizer_pre == "jais-2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
|
||||
} else if (
|
||||
tokenizer_pre == "jina-v1-en" ||
|
||||
tokenizer_pre == "jina-v2-code" ||
|
||||
|
|
@ -1912,6 +1955,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
tokenizer_pre == "kormo") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "qwen35") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN35;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "stablelm2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
|
||||
|
|
@ -1965,6 +2012,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
} else if (
|
||||
tokenizer_pre == "exaone4") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||
} else if (
|
||||
tokenizer_pre == "exaone-moe") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
|
||||
} else if (
|
||||
tokenizer_pre == "chameleon") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
|
||||
|
|
@ -1977,10 +2027,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
tokenizer_pre == "megrez") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||
} else if (
|
||||
tokenizer_pre == "gpt-4o" ||
|
||||
tokenizer_pre == "llama4") {
|
||||
tokenizer_pre == "gpt-4o" ||
|
||||
tokenizer_pre == "llama4" ||
|
||||
tokenizer_pre == "kanana2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "tiny_aya") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "superbpe") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
|
||||
|
|
@ -2011,6 +2066,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
tokenizer_pre == "hunyuan-dense") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "joyai-llm") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "kimi-k2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
|
||||
|
|
@ -2216,6 +2275,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
|| t.first == "<|end_of_text|>" // granite
|
||||
|| t.first == "<EOT>"
|
||||
|| t.first == "_<EOT>"
|
||||
|| t.first == "[EOT]" // Kimi-K2
|
||||
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
|
||||
|| t.first == "<end_of_utterance>" // smoldocling
|
||||
) {
|
||||
|
|
@ -2252,6 +2312,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
|| t.first == "<PRE>"
|
||||
|| t.first == "▁<PRE>" // CodeLlama
|
||||
|| t.first == "<|code_prefix|>" // GLM-4.5
|
||||
|| t.first == "<|prefix|>" // Falcon-H1-Tiny-Coder
|
||||
) {
|
||||
special_fim_pre_id = t.second;
|
||||
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
|
|
@ -2272,6 +2333,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
|| t.first == "<SUF>"
|
||||
|| t.first == "▁<SUF>" // CodeLlama
|
||||
|| t.first == "<|code_suffix|>" // GLM-4.5
|
||||
|| t.first == "<|suffix|>" // Falcon-H1-Tiny-Coder
|
||||
) {
|
||||
special_fim_suf_id = t.second;
|
||||
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
|
|
@ -2292,6 +2354,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
|| t.first == "<MID>"
|
||||
|| t.first == "▁<MID>" // CodeLlama
|
||||
|| t.first == "<|code_middle|>" // GLM-4.5
|
||||
|| t.first == "<|middle|>" // Falcon-H1-Tiny-Coder
|
||||
) {
|
||||
special_fim_mid_id = t.second;
|
||||
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
|
|
@ -2309,6 +2372,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
|| t.first == "<fim-pad>"
|
||||
|| t.first == "<fim_pad>" // Granite
|
||||
|| t.first == "<PAD>"
|
||||
|| t.first == "[PAD]" // Kimi-K2
|
||||
) {
|
||||
special_fim_pad_id = t.second;
|
||||
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
|
|
@ -2380,7 +2444,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
|
||||
// maintain a list of tokens that cause end-of-generation
|
||||
// this is currently determined based on the token text, which is obviously not ideal
|
||||
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/9606
|
||||
special_eog_ids.clear();
|
||||
|
||||
if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
|
||||
|
|
@ -2408,9 +2472,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
|| t.first == "<|calls|>" // solar-open
|
||||
|| t.first == "<end_of_turn>"
|
||||
|| t.first == "<|endoftext|>"
|
||||
|| t.first == "</s>" // paddleocr
|
||||
|| t.first == "<|eom_id|>"
|
||||
|| t.first == "<EOT>"
|
||||
|| t.first == "_<EOT>"
|
||||
|| t.first == "[EOT]" // Kimi-K2
|
||||
|| t.first == "[EOS]" // Kimi-K2
|
||||
|| t.first == "<|end_of_text|>"
|
||||
|| t.first == "<end_of_utterance>" // smoldocling
|
||||
) {
|
||||
|
|
@ -2436,7 +2503,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
auto & attr = id_to_token[t.second].attr;
|
||||
|
||||
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
|
||||
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
|
||||
LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
|
||||
__func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
|
||||
|
||||
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2489,7 +2559,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
special_eog_ids.erase(end_id);
|
||||
|
||||
auto & attr = id_to_token[end_id].attr;
|
||||
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
|
||||
attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
|
||||
|
||||
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
|
||||
}
|
||||
|
|
@ -3066,7 +3136,7 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
|||
}
|
||||
|
||||
int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/7587#discussion_r1620983843
|
||||
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
|
||||
const llama_token_attr attr = token_get_attr(token);
|
||||
if (!special && (attr & attr_special)) {
|
||||
|
|
@ -3289,34 +3359,34 @@ int32_t llama_vocab::impl::detokenize(
|
|||
}
|
||||
|
||||
void llama_vocab::impl::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
|
||||
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
|
||||
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
|
||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
|
||||
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
|
||||
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
|
||||
|
||||
// special tokens
|
||||
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
|
||||
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
|
||||
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
|
||||
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
|
||||
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
|
||||
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
|
||||
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
|
||||
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
|
||||
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
|
||||
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
|
||||
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
|
||||
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
|
||||
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
|
||||
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
|
||||
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
|
||||
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
|
||||
|
||||
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
|
||||
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
|
||||
|
||||
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
|
||||
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
|
||||
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
|
||||
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
|
||||
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
|
||||
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
|
||||
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
|
||||
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
|
||||
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
|
||||
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
|
||||
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
|
||||
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
|
||||
|
||||
for (const auto & id : special_eog_ids) {
|
||||
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
|
||||
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
|
||||
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
|
||||
}
|
||||
|
||||
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
|
||||
|
|
|
|||
|
|
@ -53,6 +53,11 @@ enum llama_vocab_pre_type {
|
|||
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
|
||||
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
|
||||
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
|
||||
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
|
||||
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include "llama.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "llama-chat.h"
|
||||
|
|
@ -12,6 +13,7 @@
|
|||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
|
@ -311,8 +313,12 @@ static void llama_params_fit_impl(
|
|||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
}
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
__func__, hp_nct, n_ctx_min);
|
||||
if (n_ctx_min == UINT32_MAX) {
|
||||
LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
__func__, hp_nct, n_ctx_min);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
||||
|
|
@ -821,7 +827,8 @@ int64_t llama_time_us(void) {
|
|||
}
|
||||
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
||||
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
|
||||
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
||||
// loading time will be recalculated after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
model.t_load_us = 0;
|
||||
|
|
@ -830,7 +837,8 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|||
model.t_start_us = tm.t_start_us;
|
||||
|
||||
try {
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
|
||||
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
ml.print_info();
|
||||
|
||||
|
|
@ -876,9 +884,13 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|||
}
|
||||
|
||||
static struct llama_model * llama_model_load_from_file_impl(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
const std::string & path_model,
|
||||
std::vector<std::string> & splits,
|
||||
struct llama_model_params params) {
|
||||
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
|
||||
ggml_time_init();
|
||||
|
||||
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
|
||||
|
|
@ -999,7 +1011,7 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
props.memory_free/1024/1024);
|
||||
}
|
||||
|
||||
const int status = llama_model_load(path_model, splits, *model, params);
|
||||
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
|
||||
GGML_ASSERT(status <= 0);
|
||||
if (status < 0) {
|
||||
if (status == -1) {
|
||||
|
|
@ -1015,6 +1027,18 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
return model;
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_init_from_user(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
struct llama_model_params params) {
|
||||
GGML_ASSERT(metadata != nullptr);
|
||||
std::string path_model;
|
||||
std::vector<std::string> splits = {};
|
||||
params.use_mmap = false;
|
||||
params.use_extra_bufts = false;
|
||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
||||
}
|
||||
// deprecated
|
||||
struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
|
|
@ -1026,7 +1050,7 @@ struct llama_model * llama_model_load_from_file(
|
|||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
std::vector<std::string> splits = {};
|
||||
return llama_model_load_from_file_impl(path_model, splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_load_from_splits(
|
||||
|
|
@ -1042,11 +1066,11 @@ struct llama_model * llama_model_load_from_splits(
|
|||
for (size_t i = 0; i < n_paths; ++i) {
|
||||
splits.push_back(paths[i]);
|
||||
}
|
||||
return llama_model_load_from_file_impl(splits.front(), splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
|
||||
}
|
||||
|
||||
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
||||
llama_model_saver ms(*model);
|
||||
llama_model_saver ms(model);
|
||||
ms.add_kv_from_model();
|
||||
ms.add_tensors_from_model();
|
||||
ms.save(path_model);
|
||||
|
|
@ -1091,25 +1115,55 @@ int32_t llama_chat_apply_template(
|
|||
// model split
|
||||
//
|
||||
|
||||
int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
|
||||
int32_t llama_split_path(
|
||||
char * split_path,
|
||||
size_t maxlen,
|
||||
const char * path_prefix,
|
||||
int32_t split_no,
|
||||
int32_t split_count) {
|
||||
|
||||
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
|
||||
if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
|
||||
return strlen(split_path);
|
||||
|
||||
const int written = snprintf(
|
||||
split_path,
|
||||
maxlen,
|
||||
SPLIT_PATH_FORMAT,
|
||||
path_prefix,
|
||||
split_no + 1,
|
||||
split_count
|
||||
);
|
||||
|
||||
if (written < 0 || (size_t) written >= maxlen) {
|
||||
return 0;
|
||||
}
|
||||
return 0;
|
||||
|
||||
return (int32_t) written;
|
||||
}
|
||||
|
||||
int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
|
||||
std::string str_split_path(split_path);
|
||||
char postfix[32];
|
||||
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
|
||||
std::string str_postfix(postfix);
|
||||
int32_t llama_split_prefix(
|
||||
char * split_prefix,
|
||||
size_t maxlen,
|
||||
const char * split_path,
|
||||
int32_t split_no,
|
||||
int32_t split_count) {
|
||||
|
||||
// check if split_prefix ends with postfix
|
||||
int size_prefix = str_split_path.size() - str_postfix.size();
|
||||
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
|
||||
snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
|
||||
return size_prefix;
|
||||
const std::string str_split_path(split_path);
|
||||
|
||||
char postfix[32];
|
||||
snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
|
||||
|
||||
const std::string str_postfix(postfix);
|
||||
if (str_split_path.size() <= str_postfix.size()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const size_t size_prefix = str_split_path.size() - str_postfix.size();
|
||||
|
||||
if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
|
||||
const size_t copy_len = std::min(size_prefix + 1, maxlen);
|
||||
snprintf(split_prefix, copy_len, "%s", split_path);
|
||||
|
||||
return (int32_t) size_prefix;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-opt.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
|
@ -152,6 +153,7 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
|
@ -309,7 +311,7 @@ extern "C" {
|
|||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_direct_io; // use direct io, takes precedence over use_mmap
|
||||
bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool check_tensors; // validate model tensor data
|
||||
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
||||
|
|
@ -389,6 +391,7 @@ extern "C" {
|
|||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
bool pure; // quantize all tensors to the default type
|
||||
bool keep_split; // quantize to the same number of shards
|
||||
bool dry_run; // calculate and show the final quantization size without performing quantization
|
||||
void * imatrix; // pointer to importance matrix data
|
||||
void * kv_overrides; // pointer to vector containing overrides
|
||||
void * tensor_types; // pointer to vector containing tensor types
|
||||
|
|
@ -439,19 +442,30 @@ extern "C" {
|
|||
|
||||
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
||||
|
||||
typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata);
|
||||
|
||||
// Create a new model from GGUF metadata as well as a function to set the tensor data
|
||||
// - tensors are created as GGML_TYPE_F32 by default,
|
||||
// override by adding a tensor with the same name but a different name to the context
|
||||
LLAMA_API struct llama_model * llama_model_init_from_user(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with
|
||||
void * set_tensor_data_ud, // userdata for function
|
||||
struct llama_model_params params);
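For orientation, a minimal caller sketch of this new entry point; the callback body, the metadata source, and the moment at which tensor->data becomes valid are assumptions for illustration, not part of this change:

// hypothetical callback: zero-fill every tensor the loader hands us
static void my_set_tensor_data(struct ggml_tensor * tensor, void * userdata) {
    (void) userdata;
    memset(tensor->data, 0, ggml_nbytes(tensor)); // assumes data is allocated when the callback runs
}

// metadata obtained elsewhere, e.g. via gguf_init_from_file()
struct llama_model * model = llama_model_init_from_user(
    metadata, my_set_tensor_data, /*set_tensor_data_ud=*/NULL, llama_model_default_params());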

DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params),
"use llama_model_load_from_file instead");

// Load the model from a file
// Load a model from a file
// If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
// If the split file name does not follow this pattern, use llama_model_load_from_splits
LLAMA_API struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params);

// Load the model from multiple splits (support custom naming scheme)
// Load a model from multiple splits (support custom naming scheme)
// The paths must be in the correct order
LLAMA_API struct llama_model * llama_model_load_from_splits(
const char ** paths,

@@ -482,13 +496,14 @@ extern "C" {
enum llama_params_fit_status {
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occured, e.g. because no model could be found at the specified path
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
};

// fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns true if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified
// with the exception of the context size which is modified if and only if equal to 0
LLAMA_API enum llama_params_fit_status llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,

@@ -646,7 +661,8 @@ extern "C" {

// Manually free a LoRA adapter
// NOTE: loaded adapters will be freed when the associated model is deleted
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
"adapters are now freed together with the associated model");

// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);

@@ -654,21 +670,12 @@ extern "C" {

// The following functions operate on a llama_context, hence the naming: llama_verb_...

// Add a loaded LoRA adapter to given context
// This will not modify model's weight
LLAMA_API int32_t llama_set_adapter_lora(
// Set LoRA adapters on the context. Will only modify if the adapters currently in the context are different.
LLAMA_API int32_t llama_set_adapters_lora(
struct llama_context * ctx,
struct llama_adapter_lora * adapter,
float scale);

// Remove a specific LoRA adapter from given context
// Return -1 if the adapter is not present in the context
LLAMA_API int32_t llama_rm_adapter_lora(
struct llama_context * ctx,
struct llama_adapter_lora * adapter);

// Remove all LoRA adapters from given context
LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
struct llama_adapter_lora ** adapters,
size_t n_adapters,
float * scales);
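A usage sketch of the batch-style replacement for the removed per-adapter calls (the adapter handles are placeholders loaded earlier with llama_adapter_lora_init):

struct llama_adapter_lora * adapters[2] = { adapter_a, adapter_b };
float scales[2] = { 1.0f, 0.5f };
llama_set_adapters_lora(ctx, adapters, 2, scales);
// presumably, passing n_adapters == 0 clears the set (the old llama_clear_adapter_lora behavior)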

// Apply a loaded control vector to a llama_context, or if data is NULL, clear
// the currently loaded vector.

@@ -676,7 +683,7 @@ extern "C" {
// to an n_embd x n_layers buffer starting from layer 1.
// il_start and il_end are the layer range the vector should apply to (both inclusive)
// See llama_control_vector_load in common to load a control vector.
LLAMA_API int32_t llama_apply_adapter_cvec(
LLAMA_API int32_t llama_set_adapter_cvec(
struct llama_context * ctx,
const float * data,
size_t len,

@@ -979,7 +986,7 @@ extern "C" {

// Logits for the ith token. For positive indices, Equivalent to:
// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
// Negative indicies can be used to access logits in reverse order, -1 is the last logit.
// Negative indices can be used to access logits in reverse order, -1 is the last logit.
// returns NULL for invalid ids.
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
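For example, after a decode the logits of the most recent output token can be read with a negative index (sketch):

llama_decode(ctx, batch);
float * last_logits = llama_get_logits_ith(ctx, -1); // same row as the final output token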

@@ -994,7 +1001,7 @@ extern "C" {

// Get the embeddings for the ith token. For positive indices, Equivalent to:
// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
// Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
// Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
// shape: [n_embd] (1-dimensional)
// returns NULL for invalid ids.
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);

@@ -1014,9 +1021,9 @@ extern "C" {
// Returns LLAMA_TOKEN_NULL if no token was sampled.
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);

// Get the backend sampled probabilites for the ith token
// Get the backend sampled probabilities for the ith token
// The index matches llama_get_sampled_token_ith().
// Returns NULL if no probabilites were generated.
// Returns NULL if no probabilities were generated.
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);

@@ -1148,9 +1155,9 @@ extern "C" {
//

/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
///
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
/// @param tmpl A Jinja template to use for this chat.
/// @param chat Pointer to a list of multiple llama_chat_message
/// @param n_msg Number of llama_chat_message in this chat
/// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
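A minimal call sketch consistent with the tmpl-only documentation above (the template name and buffer size are illustrative; the full signature lies outside this hunk):

llama_chat_message chat[2] = {
    { "system", "You are a helpful assistant." },
    { "user",   "Hello!"                       },
};
char buf[1024];
int32_t n = llama_chat_apply_template("chatml", chat, 2, /*add_ass=*/true, buf, sizeof(buf));
// a result larger than sizeof(buf) would indicate the buffer was too small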

@@ -1255,7 +1262,6 @@ extern "C" {
// [EXPERIMENTAL]
// attach a sampler to the context
// note: prefer initializing the context with llama_context_params.samplers when possible
// note: changing the samplers of a context can cause graph reallocations and degraded performance
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);

// mirror of llama_sampler_i:

@@ -1344,7 +1350,7 @@ extern "C" {
float tau,
float eta);

/// @details Intializes a GBNF grammar, see grammars/README.md for details.
/// @details Initializes a GBNF grammar, see grammars/README.md for details.
/// @param vocab The vocabulary that this grammar will be used with.
/// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
/// @param grammar_root The name of the start symbol for the grammar.

@@ -1395,6 +1401,33 @@ extern "C" {
const char ** seq_breakers,
size_t num_breakers);

/// adaptive-p: select tokens near a configurable target probability over time.
///
/// the adaptive-p sampler transforms the token probability distribution to favor tokens
/// that fall near a user-configurable probability target.
///
/// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
/// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
/// adapted target probability at each sampling step, thus maintaining the desired target
/// probability over time.
///
/// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
/// in the sampler chain (like mirostat, dist, greedy).
///
/// only mild truncation before this sampler is recommended. we suggest applying min-p
/// before adaptive-p as the only other active sampler in the chain.
///
/// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
/// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
/// @param seed RNG seed
///
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927
///
LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
float target,
float decay,
uint32_t seed);
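Following the recommendation above, a sampler-chain sketch with min-p as the only other active sampler (parameter values are illustrative):

struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, /*min_keep=*/1));
// adaptive-p selects the token itself, so it must be the last sampler in the chain
llama_sampler_chain_add(chain, llama_sampler_init_adaptive_p(/*target=*/0.5f, /*decay=*/0.9f, /*seed=*/LLAMA_DEFAULT_SEED));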

LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab,
int32_t n_logit_bias,

@@ -1448,12 +1481,12 @@ extern "C" {
/// @details Build a split GGUF final path for this chunk.
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
// Returns the split_path length.
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count);

/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
// Returns the split_prefix length.
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count);
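The two calls compose directly; note that the implementation shown earlier formats split_no + 1, so the index appears to be zero-based, and the sketch below follows the implementation rather than the doc example:

char split_path[128];
llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 1, 4);
// split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf"

char split_prefix[64];
int32_t n = llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 1, 4);
// n > 0 and split_prefix == "/models/ggml-model-q4_0" because the -00002-of-00004 postfix matches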
|
||||
|
||||
// Print system information
|
||||
LLAMA_API const char * llama_print_system_info(void);
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -127,7 +127,6 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
|
|||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU,
|
||||
hparams.expert_weights_norm, // norm_w (route_norm=True)
|
||||
hparams.expert_weights_scale, // scale_w
|
||||
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,11 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -104,7 +103,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -56,6 +56,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
|
|||
);
|
||||
break;
|
||||
case LLM_TYPE_13B:
|
||||
case LLM_TYPE_UNKNOWN:
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -97,7 +96,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
false, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,13 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -90,7 +88,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,12 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -129,9 +127,17 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
|||
// feed-forward network
|
||||
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
|
||||
// MoE branch
|
||||
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
|
||||
model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
|
||||
LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
|
||||
cur = build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
nullptr,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
hparams.n_expert, hparams.n_expert_used,
|
||||
LLM_FFN_GELU, false,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
|
||||
model.arch == LLM_ARCH_JINA_BERT_V3) {
|
||||
|
|
|
|||
|
|
@ -2,9 +2,9 @@
|
|||
|
||||
|
||||
llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -29,10 +29,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
if (model.layers[il].wq_scale) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
|
||||
}
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
|
|
@ -40,10 +37,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
}
|
||||
|
||||
// B1.K
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
if (model.layers[il].wk_scale) {
|
||||
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
|
||||
}
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
|
|
@ -51,10 +45,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
}
|
||||
|
||||
// B1.V
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
if (model.layers[il].wv_scale) {
|
||||
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
|
||||
}
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
|
|
@ -90,10 +81,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_sub_norm", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].wo, cur);
|
||||
if (model.layers[il].wo_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
|
||||
}
|
||||
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
||||
if (model.layers[il].bo) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
||||
}
|
||||
|
|
@ -115,8 +103,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
|
||||
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
|
||||
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
|
||||
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
|
||||
NULL, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
|
|
@ -127,10 +115,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_sub_norm", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].ffn_down, cur);
|
||||
if (model.layers[il].ffn_down_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
|
||||
}
|
||||
cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
|
||||
cb(cur, "ffn_down", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
#include <float.h>
|
||||
|
||||
llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,11 +2,11 @@
|
|||
|
||||
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * inpL;
|
||||
ggml_tensor * cur;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
const float f_logit_scale = hparams.f_logit_scale;
|
||||
|
||||
|
|
|
|||
|
|
@ -4,9 +4,9 @@
|
|||
|
||||
llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
const float f_logit_scale = hparams.f_logit_scale;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -89,7 +88,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,13 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -100,7 +98,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -2,22 +2,19 @@
|
|||
|
||||
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
|
||||
bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
|
||||
|
||||
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
||||
const bool is_mla = hparams.is_mla();
|
||||
|
||||
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
|
||||
const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
|
||||
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
|
||||
|
||||
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
||||
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
|
||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||
// See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
|
||||
// And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
|
||||
// first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
|
||||
|
|
@ -43,11 +40,13 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
auto * inp_attn_kv = !is_mla ? build_attn_inp_kv() : nullptr;
|
||||
auto * inp_attn_k = is_mla ? build_attn_inp_k() : nullptr;
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
for (int il = 0; il < effective_n_layers; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
|
@ -57,6 +56,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|||
// self_attention
|
||||
{
|
||||
ggml_tensor * q = NULL;
|
||||
|
||||
const bool is_lite = model.layers[il].wq;
|
||||
|
||||
if (!is_lite) {
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
|
||||
cb(q, "q", il);
|
||||
|
|
@@ -124,14 +126,14 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
                // note: rope must go first for in-place context shifting in build_rope_shift()
                ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
                ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
                cb(Qcur, "Qcur", il);

                kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
                cb(kv_cmpr, "kv_cmpr_reshape", il);

                // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
                ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
                ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
                cb(Kcur, "Kcur", il);

                // {kv_lora_rank, 1, n_tokens}
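// illustration of the per-head row layouts after the reordered concats above
// (inferred from the new concat order; widths assumed from the shape comments):
//   Qcur = [ q_nope_absorbed (kv_lora_rank) | q_pe (n_embd_head_qk_rope) ]
//   Kcur = [ kv_cmpr         (kv_lora_rank) | k_pe (n_embd_head_qk_rope) ]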
@@ -144,8 +146,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                    cb(Qcur, "Qcur_attn_temp_scaled", il);
                }

                // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
                cur = build_attn(inp_attn,
                // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
                cur = build_attn(inp_attn_k,
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
            } else {
@@ -169,11 +171,10 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                Vcur = ggml_cont(ctx0, Vcur);
                cb(Vcur, "Vcur_cont", il);

                // note: rope must go first for in-place context shifting in build_rope_shift()
                ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
                ggml_tensor * Qcur = ggml_concat(ctx0, q_nope, q_pe, 0);
                cb(Qcur, "Qcur", il);

                ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
                ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
                cb(Kcur, "Kcur", il);

                if (inp_attn_scale) {
@@ -183,12 +184,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                }

                // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
                cur = build_attn(inp_attn,
                cur = build_attn(inp_attn_kv,
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
            }
        }
        if (il == n_layer - 1 && inp_out_ids) {
        if (il == effective_n_layers - 1 && inp_out_ids) {
            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
@@ -215,9 +216,11 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                model.layers[il].ffn_exp_probs_b,
                n_expert, n_expert_used,
                LLM_FFN_SILU, hparams.expert_weights_norm,
                hparams.expert_weights_scale, hparams.expert_weights_scale,
                hparams.expert_weights_scale,
                (llama_expert_gating_func_type) hparams.expert_gating_func,
                il);
                il,
                nullptr,
                model.layers[il].ffn_gate_up_exps);
        cb(moe_out, "ffn_moe_out", il);

        // FFN shared expert
@@ -0,0 +1,445 @@
#include "models.h"

#include "llama-impl.h"

// utility to get one slice from the third dimension
// input dim: [x, y, c, b]
// output dim: [x, y, 1, b]
static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
    return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
            t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
}
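// usage sketch (illustrative): for t with ne = [S, CS, n_chunks, H*n_seqs],
//   ggml_tensor * ch = get_slice_2d(ctx0, t, 2);
// yields a no-copy view of chunk index 2 with ne = [S, CS, 1, H*n_seqs]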

llm_build_delta_net_base::llm_build_delta_net_base(const llm_graph_params & params) : llm_graph_context(params) {}

std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net_chunking(
        ggml_tensor * q,
        ggml_tensor * k,
        ggml_tensor * v,
        ggml_tensor * g,
        ggml_tensor * b,
        ggml_tensor * s,
        int il) {
    const int64_t S_k = q->ne[0];
    const int64_t H_k = q->ne[1];
    const int64_t n_tokens = q->ne[2];
    const int64_t n_seqs = q->ne[3];

    const int64_t S_v = v->ne[0];
    const int64_t H_v = v->ne[1];
    const bool kda = (g->ne[0] == S_k && g->ne[1] == H_k);

    GGML_ASSERT(S_k == S_v);
    GGML_ASSERT(H_v % H_k == 0);

    GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
    GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
    GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);

    GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
    GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
    GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
    GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);

    const float scale = 1.0f / sqrtf(S_k);

    q = ggml_scale(ctx0, q, scale);

    cb(q, "q_in", il);
    cb(k, "k_in", il);
    cb(v, "v_in", il);
    cb(b, "b_in", il);
    cb(g, "g_in", il);

    q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
    k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
    v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
    g = ggml_permute(ctx0, g, 0, 2, 1, 3); // [g_0, n_tokens, H_v, n_seqs]
    b = ggml_permute(ctx0, b, 0, 2, 1, 3); // [ 1, n_tokens, H_v, n_seqs]

    const int CS = kda ? 16 : 64; // chunk size

    const int pad = (CS - n_tokens % CS) % CS;
    const int n_chunks = (n_tokens + pad) / CS;

    q = ggml_pad(ctx0, q, 0, pad, 0, 0);
    k = ggml_pad(ctx0, k, 0, pad, 0, 0);
    v = ggml_pad(ctx0, v, 0, pad, 0, 0);
    g = ggml_pad(ctx0, g, 0, pad, 0, 0);
    b = ggml_pad(ctx0, b, 0, pad, 0, 0);

    ggml_tensor * v_b = ggml_mul(ctx0, v, b);
    ggml_tensor * k_b = ggml_mul(ctx0, k, b);

    cb(v_b, "v_b", il);
    cb(k_b, "k_b", il);

    q = ggml_reshape_4d(ctx0, q, S_k, CS, n_chunks, H_k * n_seqs);
    k = ggml_reshape_4d(ctx0, k, S_k, CS, n_chunks, H_k * n_seqs);
    k_b = ggml_reshape_4d(ctx0, k_b, S_k, CS, n_chunks, H_v * n_seqs);
    v = ggml_reshape_4d(ctx0, v, S_v, CS, n_chunks, H_v * n_seqs);
    v_b = ggml_reshape_4d(ctx0, v_b, S_v, CS, n_chunks, H_v * n_seqs);

    g = ggml_reshape_4d(ctx0, g, g->ne[0], CS, n_chunks, H_v * n_seqs);
    b = ggml_reshape_4d(ctx0, b, 1, CS, n_chunks, H_v * n_seqs);

    // [CS, g_0, n_chunks, H_v * n_seqs]
    // TODO: extend ggml_cumsum with axis parameter to avoid transpose
    ggml_tensor * g_cs = ggml_cumsum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, g)));
    cb(g_cs, "g_cs", il);

    ggml_tensor * kb = nullptr;
    ggml_tensor * kq = nullptr;
    if (kda) {
        const int64_t CHB = n_chunks * H_k * n_seqs;

        ggml_tensor * g_cs_i = ggml_reshape_4d(ctx0, g_cs, CS, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB]
        ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, S_k, CHB); // [1, chunk_size, S_k, CHB]

        g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB]

        // decay_mask [chunk_size,chunk_size,S_k,CHB]
        ggml_tensor * decay_mask;
        decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
        decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
        decay_mask = ggml_exp(ctx0, decay_mask);
        cb(decay_mask, "decay_mask", il);

        // decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched
        decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, CS, CS, CHB);

        ggml_tensor * k_b_i = ggml_reshape_4d(ctx0, k_b, S_k, CS, 1, CHB);
        ggml_tensor * k_j = ggml_reshape_4d(ctx0, k, S_k, 1, CS, CHB);
        ggml_tensor * q_i = ggml_reshape_4d(ctx0, q, S_k, CS, 1, CHB);

        ggml_tensor * decay_k_b_i = ggml_mul(ctx0, decay_mask, k_b_i);
        ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i);

        // decay_k_b_i [S,BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB]
        kb = ggml_mul_mat(ctx0, decay_k_b_i, k_j);
        kq = ggml_mul_mat(ctx0, decay_q_i, k_j);

        kb = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, kb, CS, CS, n_chunks, H_v * n_seqs)));
        kq = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, kq, CS, CS, n_chunks, H_v * n_seqs)));
    } else {
        ggml_tensor * g_cs_i = g_cs;
        ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, n_chunks, H_v * n_seqs);

        g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, n_chunks, H_v * n_seqs);

        // [CS, CS, n_chunks, H_v * n_seqs]
        ggml_tensor * decay_mask;
        decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
        decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
        decay_mask = ggml_exp(ctx0, decay_mask);
        cb(decay_mask, "decay_mask", il);

        // [CS, CS, n_chunks, H_k * n_seqs]
        kb = ggml_mul_mat(ctx0, k, k_b);
        kb = ggml_mul (ctx0, kb, decay_mask);

        // [CS, CS, n_chunks, H_k * n_seqs]
        kq = ggml_mul_mat(ctx0, k, q);
        kq = ggml_mul(ctx0, kq, decay_mask);
    }

    kq = ggml_tri(ctx0, kq, GGML_TRI_TYPE_LOWER_DIAG);
    cb(kq, "kq", il);

    // [CS, CS, n_chunks, H_k * n_seqs]
    ggml_tensor * attn;
    attn = ggml_tri(ctx0, kb, GGML_TRI_TYPE_LOWER);
    cb(attn, "attn", il);

    ggml_tensor * identity;
    identity = ggml_view_1d(ctx0, attn, CS, 0);
    identity = ggml_fill (ctx0, identity, 1.0f);
    identity = ggml_diag (ctx0, identity);

    ggml_tensor * lhs = ggml_add(ctx0, attn, identity);
    cb(lhs, "dnet_add_ch_lhs", il);

    attn = ggml_neg(ctx0, attn);
    cb(attn, "attn_pre_solve", il);

    ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
    attn = ggml_add(ctx0, lin_solve, identity);
    cb(attn, "dnet_add_ch_attn_solved", il); // [CS, CS, n_chunks, H_k * n_seqs]

    // [S_v, CS, n_chunks, H_v * n_seqs]
    v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_b)), attn);

    // [CS, 1, n_chunks, H_v * n_seqs] KDA: [CS, S_k, n_chunks, H_v * n_seqs]
    ggml_tensor * g_exp = ggml_exp(ctx0, g_cs);

    k_b = ggml_cont(ctx0, ggml_transpose(ctx0, k_b));

    // [CS, S_k, n_chunks, H_k * n_seqs]
    ggml_tensor * kbg = ggml_mul(ctx0, k_b, g_exp);
    cb(kbg, "k_beta_g_exp", il);

    // [S_k, CS, n_chunks, H_k * n_seqs]
    ggml_tensor * k_cd = ggml_mul_mat(ctx0, kbg, attn);
    cb(k_cd, "k_cumdecay", il);

    // [1, CS, n_chunks, H_k * n_seqs] KDA: [S_k, CS, n_chunks, H_k * n_seqs]
    ggml_tensor * g_exp_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_exp));
    ggml_tensor * q_g_exp = ggml_mul(ctx0, q, g_exp_t);

    // vectorized calculation of key_gdiff
    // improved from the chunked version:
    // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
    // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
    // key_gdiff = key * g_diff.unsqueeze(-1)
    // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
    // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
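    // assumed mapping of the torch sketch above onto the tensors below:
    //   g_cum -> g_cs, g_cum[..., -1] -> g_last, g_diff -> g_diff,
    //   key_gdiff -> kg, v_new -> v_t_new (per chunk), last_recurrent_state -> s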

    // get last element in g_cumsum along CS dimension (ne0)
    // example: [[x, y, z, ..., last], ...] -> [[last], ...]
    // [1, 1, n_chunks, H_v * n_seqs] KDA: [1, S_k, n_chunks, H_v * n_seqs]
    ggml_tensor * g_last = ggml_view_4d(ctx0, g_cs, 1, g_cs->ne[1], g_cs->ne[2], g_cs->ne[3],
            g_cs->nb[1],
            g_cs->nb[2],
            g_cs->nb[3],
            ggml_row_size(g_cs->type, g_cs->ne[0] - 1));
    cb(g_last, "g_last", il);

    // TODO: remove this cont when CUDA supports non-cont unary ops
    g_last = ggml_cont(ctx0, g_last);

    // [1, 1, n_chunks, H_v * n_seqs] KDA: [S_k, 1, n_chunks, H_v * n_seqs]
    ggml_tensor * g_last_exp_t = ggml_transpose(ctx0, ggml_exp(ctx0, g_last));
    cb(g_last_exp_t, "g_last_exp_t", il);

    // [CS, 1, n_chunks, H_v * n_seqs] KDA: [CS, S_k, n_chunks, H_v * n_seqs]
    ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cs, g_last));
    cb(g_diff, "g_diff", il);

    ggml_tensor * g_diff_exp_t = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_exp(ctx0, g_diff)));

    // [S_k, CS, n_chunks, H_v * n_seqs]
    ggml_tensor * kg = ggml_mul(ctx0, k, g_diff_exp_t);
    cb(kg, "key_gdiff", il);

    // [CS, S_k, n_chunks, H_v * n_seqs]
    ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
    cb(kg_t, "key_gdiff_t", il);

    s = ggml_reshape_4d(ctx0, s, S_v, S_v, 1, H_v * n_seqs);
    cb(s, "dnet_add_ch_state", il);

    // [CS, S_v, n_chunks, H_v * n_seqs]
    ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));

    for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
        ggml_tensor * ch_k_cd = get_slice_2d(ctx0, k_cd, chunk); // [S_k, CS, 1, H_k * n_seqs]
        ggml_tensor * ch_v_t = get_slice_2d(ctx0, v_t, chunk); // [ CS, S_v, 1, H_v * n_seqs]
        ggml_tensor * ch_kq = get_slice_2d(ctx0, kq, chunk); // [ CS, CS, 1, H_k * n_seqs]
        ggml_tensor * ch_q_g_exp = get_slice_2d(ctx0, q_g_exp, chunk); // [S_k, CS, 1, H_k * n_seqs]
        ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]

        // [CS, S_v, 1, H_v * n_seqs]
        ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s);
        cb(v_t_p, "v_prime", il);

        // [CS, S_v, 1, H_v * n_seqs]
        ggml_tensor * v_t_new = ggml_sub(ctx0, ch_v_t, v_t_p);
        cb(v_t_new, "v_t_new", il);

        // [S_v, CS, 1, H_v * n_seqs]
        ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_t_new, ch_kq);
        cb(v_attn, "v_attn", il);

        // [S_v, CS, 1, H_v * n_seqs]
        ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s, ch_q_g_exp);
        cb(attn_inter, "attn_inter", il);

        // [S_v, CS, 1, H_v * n_seqs]
        ggml_tensor * o_ch = ggml_add(ctx0, attn_inter, v_attn);
        cb(o_ch, "dnet_add_ch_attn_out", il);

        v = ggml_set_inplace(ctx0, v, o_ch, v->nb[1], v->nb[2], v->nb[3], chunk * v->nb[2]);

        // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
        // TODO: head broadcast might not work here - probably will need a transpose
        ggml_tensor * kgv = ggml_mul_mat(ctx0, ch_kg_t, v_t_new); // [S_k, S_v, 1, H_k * n_seqs]

        // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
        ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);

        s = ggml_mul(ctx0, s, ch_g_last_exp_t);
        s = ggml_add(ctx0, s, kgv);
        cb(s, "dnet_add_ch_state", il);
    }

    // truncate padded tokens
    ggml_tensor * o = ggml_view_4d(ctx0, v,
            S_v, n_tokens, H_v, n_seqs,
            ggml_row_size(v->type, S_v),
            ggml_row_size(v->type, S_v * CS * n_chunks),
            ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
    o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
    s = ggml_reshape_4d(ctx0, s, S_v, S_v, H_v, n_seqs);
    cb(s, "output_state", il);

    return {o, s};
}
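// rough shape of the chunked recurrence above (illustrative notation, not taken
// from this file): with A = strict_tril(K (b*K)^T * decay_mask), the triangular
// solve computes T = (I + A)^{-1}; then per chunk:
//   v_new = T (b * v)                                 -- intra-chunk reads removed
//   o     = tril(Q K^T * decay_mask) v_new + (Q * exp(g_cs)) S
//   S    <- S * exp(g_last) + (K * exp(g_last - g_cs))^T v_new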

std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net_autoregressive(
        ggml_tensor * q,
        ggml_tensor * k,
        ggml_tensor * v,
        ggml_tensor * g,
        ggml_tensor * b, // beta
        ggml_tensor * s, // state
        int il) {
    const int64_t S_k = q->ne[0];
    const int64_t H_k = q->ne[1];
    const int64_t n_tokens = q->ne[2];
    const int64_t n_seqs = q->ne[3];

    const int64_t S_v = v->ne[0];
    const int64_t H_v = v->ne[1];

    GGML_ASSERT(n_tokens == 1);

    GGML_ASSERT(S_k == S_v);
    GGML_ASSERT(H_v % H_k == 0);

    GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
    GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
    GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);

    GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
    GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
    GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
    GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);

    const float scale = 1.0f / sqrtf(S_k);

    q = ggml_scale(ctx0, q, scale);

    q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
    k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
    v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]

    cb(q, "q_in", il);
    cb(k, "k_in", il);
    cb(v, "v_in", il);
    cb(b, "b_in", il);
    cb(g, "g_in", il);

    // GDA: [1, 1, H_v, n_seqs]
    // KDA: [1, S_k, H_v, n_seqs]
    g = ggml_reshape_4d(ctx0, g, 1, g->ne[0], H_v, n_seqs);
    b = ggml_reshape_4d(ctx0, b, 1, 1, H_v, n_seqs);

    // [S_v, S_v, H_v, n_seqs]
    g = ggml_exp(ctx0, g);
    s = ggml_mul(ctx0, s, g);

    // [1, S_v, H_v, n_seqs]
    ggml_tensor * sk;
    sk = ggml_mul (ctx0, s, k);
    sk = ggml_sum_rows(ctx0, sk);

    // [S_v, 1, H_v, n_seqs]
    ggml_tensor * d;
    d = ggml_sub(ctx0, v, ggml_transpose(ctx0, sk));
    d = ggml_mul(ctx0, d, b);

    // [1, S_v, H_v, n_seqs]
    ggml_tensor * d_t;
    d_t = ggml_transpose(ctx0, d);

    // [S_v, S_v, H_v, n_seqs]
    ggml_tensor * kd;
    k = ggml_repeat(ctx0, k, s);
    kd = ggml_mul (ctx0, k, d_t);

    s = ggml_add(ctx0, s, kd);

    cb(s, "dnet_add_ar_state", il);

    ggml_tensor * s_q = ggml_mul (ctx0, s, q);
    ggml_tensor * o = ggml_sum_rows(ctx0, s_q);

    o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]

    return {o, s};
}
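// the single-token path above is the gated delta rule in closed form; in loose
// notation (orientation of S as laid out here is assumed):
//   S <- S * exp(g);  d = b * (v - S k);  S <- S + k d^T;  o = S q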

std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net_fused(
        ggml_tensor * q,
        ggml_tensor * k,
        ggml_tensor * v,
        ggml_tensor * g,
        ggml_tensor * b,
        ggml_tensor * s,
        int il) {
    const int64_t S_k = q->ne[0];
    const int64_t H_k = q->ne[1];
    const int64_t n_tokens = q->ne[2];
    const int64_t n_seqs = q->ne[3];

    const int64_t S_v = v->ne[0];
    const int64_t H_v = v->ne[1];

    GGML_ASSERT(S_k == S_v);
    GGML_ASSERT(H_v % H_k == 0);

    GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
    GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
    GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);

    GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
    GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
    GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
    GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);

    ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
    if (n_tokens == 1) {
        cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
    } else {
        cb(result, LLAMA_TENSOR_NAME_FGDN_CH, il);
    }

    ggml_tensor * output = ggml_view_4d(ctx0, result,
            S_v, H_v, n_tokens, n_seqs,
            ggml_row_size(result->type, S_v),
            ggml_row_size(result->type, S_v * H_v),
            ggml_row_size(result->type, S_v * H_v * n_tokens), 0);

    ggml_tensor * new_state = ggml_view_4d(ctx0, result,
            S_v, S_v, H_v, n_seqs,
            ggml_row_size(result->type, S_v),
            ggml_row_size(result->type, S_v * S_v),
            ggml_row_size(result->type, S_v * S_v * H_v),
            ggml_row_size(result->type, S_v * H_v * n_tokens * n_seqs));

    return {output, new_state};
}
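// note on the packing the two views above rely on: the fused op is expected to
// return output and updated state in one contiguous buffer, output first
// (S_v * H_v * n_tokens * n_seqs values), with the state view starting right
// after it - that is exactly what the second view's byte offset encodes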

std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net(
        ggml_tensor * q,
        ggml_tensor * k,
        ggml_tensor * v,
        ggml_tensor * g,
        ggml_tensor * b,
        ggml_tensor * s,
        int il) {
    const int64_t n_seq_tokens = q->ne[2];

    if (n_seq_tokens == 1) {
        if (cparams.fused_gdn_ar) {
            return build_delta_net_fused(q, k, v, g, b, s, il);
        }
        return build_delta_net_autoregressive(q, k, v, g, b, s, il);
    }

    if (cparams.fused_gdn_ch) {
        return build_delta_net_fused(q, k, v, g, b, s, il);
    }

    return build_delta_net_chunking(q, k, v, g, b, s, il);
}
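// hypothetical call site for the dispatcher above (every name other than
// build_delta_net is an assumption for illustration):
//   // q/k/v/g/beta come from the layer's projections, prev_state from the
//   // recurrent state cache
//   auto [attn_out, new_state] = build_delta_net(q, k, v, g, beta, prev_state, il);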
@@ -1,13 +1,11 @@
#include "models.h"



llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
        llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_head = hparams.n_embd_head_v();

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
@@ -91,7 +89,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
                model.layers[il].ffn_exp_probs_b,
                n_expert, n_expert_used,
                LLM_FFN_SILU, hparams.expert_weights_norm,
                true, hparams.expert_weights_scale,
                hparams.expert_weights_scale,
                (llama_expert_gating_func_type) hparams.expert_gating_func,
                il);
        cb(moe_out, "ffn_moe_out", il);