diff --git a/.devcontainer/CPU/Dockerfile b/.devcontainer/CPU/Dockerfile new file mode 100644 index 0000000..3f47760 --- /dev/null +++ b/.devcontainer/CPU/Dockerfile @@ -0,0 +1,18 @@ +FROM mcr.microsoft.com/devcontainers/cpp:1-ubuntu-20.04 + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake.sh + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends diff --git a/.devcontainer/CPU/devcontainer.json b/.devcontainer/CPU/devcontainer.json new file mode 100644 index 0000000..4c1d146 --- /dev/null +++ b/.devcontainer/CPU/devcontainer.json @@ -0,0 +1,33 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/cpp +{ + "name": "C++", + "build": { + "context": "../..", + "dockerfile": "Dockerfile" + }, + + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools" + ] + } + } + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "gcc -v", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. 
More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.devcontainer/CUDA/Dockerfile b/.devcontainer/CUDA/Dockerfile new file mode 100644 index 0000000..d7df2de --- /dev/null +++ b/.devcontainer/CUDA/Dockerfile @@ -0,0 +1,24 @@ +FROM nvcr.io/nvidia/cuda:12.9.1-devel-ubuntu20.04 + +RUN apt-get update +RUN apt-get upgrade -y +RUN apt-get install curl -y + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake.sh + + + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends diff --git a/.devcontainer/CUDA/devcontainer.json b/.devcontainer/CUDA/devcontainer.json new file mode 100644 index 0000000..a59e314 --- /dev/null +++ b/.devcontainer/CUDA/devcontainer.json @@ -0,0 +1,43 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/cpp +{ + "name": "CUDA", + "build": { + "context": "../..", + "dockerfile": "Dockerfile" + }, + + "runArgs": [ + "--gpus", + "all" + ], + + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools" + ] + } + }, + + "hostRequirements": { + "gpu": "optional" + }, + + "features": { + "ghcr.io/devcontainers/features/git:1": {} + } + + // Use 'forwardPorts' to make a list of ports inside the container available locally. 
+ // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "gcc -v", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.devcontainer/ROCm/Dockerfile b/.devcontainer/ROCm/Dockerfile new file mode 100644 index 0000000..ee8c283 --- /dev/null +++ b/.devcontainer/ROCm/Dockerfile @@ -0,0 +1,24 @@ +FROM rocm/dev-ubuntu-20.04:latest + +RUN apt-get update +RUN apt-get upgrade -y +RUN apt-get install curl -y + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake.sh + + + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends diff --git a/.devcontainer/ROCm/devcontainer.json b/.devcontainer/ROCm/devcontainer.json new file mode 100644 index 0000000..302e224 --- /dev/null +++ b/.devcontainer/ROCm/devcontainer.json @@ -0,0 +1,43 @@ +// For format details, see https://aka.ms/devcontainer.json. 
For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/cpp +{ + "name": "ROCm", + "build": { + "context": "../..", + "dockerfile": "Dockerfile" + }, + + "runArgs": [ + "--gpus", + "all" + ], + + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools" + ] + } + }, + + "hostRequirements": { + "gpu": "optional" + }, + + "features": { + "ghcr.io/devcontainers/features/git:1": {} + } + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "gcc -v", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index 08d2ac4..0000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "image": "mcr.microsoft.com/devcontainers/universal:2", - "features": { - "ghcr.io/devcontainers/features/nvidia-cuda:1": { - "instalNvtx": true, - "installToolkit": true, - "cudaVersion": "12.3" - } - } -} diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..3805f8a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +build/ +lib/ +bin/ +include/ +build-temp/ +.git/ +.vscode/ diff --git a/.github/workflows/cmake-single-platform.yml b/.github/workflows/cmake-multi-platform.yml similarity index 55% rename from .github/workflows/cmake-single-platform.yml rename to .github/workflows/cmake-multi-platform.yml index e69bac7..7347fe3 100644 --- a/.github/workflows/cmake-single-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -1,6 +1,6 @@ # This starter workflow is for a CMake project running on a single platform. 
There is a different starter workflow if you need cross-platform coverage. # See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml -name: CMake on a single platform +name: CMake on multi platform on: push: @@ -17,23 +17,52 @@ jobs: # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. # You can convert this to a matrix build if you need cross-platform coverage. # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix - runs-on: ubuntu-latest strategy: fail-fast: false matrix: - cuda-version: [ "12.3" ] + os: [ "ubuntu-22.04", "ubuntu-22.04-arm" ] + cuda-version: [ "12.9", "12.4", "11.8" ] + rocm-enabled: [ false ] + include: + - os: "ubuntu-22.04" + cuda-version: "11.8" + rocm-enabled: true + rocm-version: "6.4.2" + - os: "windows-latest" + cuda-version: "12.4.1" + + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 + + - name: Run ROCm bash shell Ubuntu/Debian + if: (startsWith(matrix.os, 'ubuntu') || startsWith(matrix.os, 'debian')) && matrix.rocm-enabled + env: + temp: ${{ runner.temp }} + rocm: ${{ matrix.rocm-version }} + run: | + chmod +x ${{github.workspace}}/scripts/actions/install-rocm-ubuntu.sh + ${{github.workspace}}/scripts/actions/install-rocm-ubuntu.sh + shell: bash - - name: Run CUDA bash shell + - name: Run CUDA bash shell Ubuntu/Debian + if: startsWith(matrix.os, 'ubuntu') || startsWith(matrix.os, 'debian') env: temp: ${{ runner.temp }} cuda: ${{ matrix.cuda-version }} run: | - chmod +x ./scripts/actions/install-cuda-ubuntu.sh - ./scripts/actions/install-cuda-ubuntu.sh + chmod +x ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh + ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh shell: bash + - name: Run CUDA bash shell Windows + if: runner.os == 'Windows' + env: + temp: ${{ runner.temp }} + cuda: ${{ matrix.cuda-version }} + run: 
scripts/actions/install-cuda-windows.ps1 + shell: pwsh + - name: Configure CMake # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type @@ -44,7 +73,8 @@ jobs: run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test - working-directory: ${{github.workspace}}/build + working-directory: ${{github.workspace}}/ + if: runner.os != 'Windows' # Execute tests defined by the CMake configuration. # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail run: ctest -C ${{env.BUILD_TYPE}} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 89997e7..601cc8d 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -27,7 +27,7 @@ jobs: # - https://gh.io/supported-runners-and-hardware-resources # - https://gh.io/using-larger-runners # Consider using larger runners for possible analysis time improvements. 
- runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-22.04' }} timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} permissions: actions: read @@ -38,7 +38,7 @@ jobs: fail-fast: false matrix: language: [ 'c-cpp' ] - cuda-version: [ "12.3" ] + cuda-version: [ "12.9" ] # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ] # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both @@ -53,8 +53,8 @@ jobs: temp: ${{ runner.temp }} cuda: ${{ matrix.cuda-version }} run: | - chmod +x ./scripts/actions/install-cuda-ubuntu.sh - ./scripts/actions/install-cuda-ubuntu.sh + chmod +x ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh + ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh shell: bash # Initializes the CodeQL tools for scanning. diff --git a/.github/workflows/deploy-webpages.yml b/.github/workflows/deploy-webpages.yml new file mode 100644 index 0000000..3edbb5e --- /dev/null +++ b/.github/workflows/deploy-webpages.yml @@ -0,0 +1,88 @@ +# This starter workflow is for a CMake project running on a single platform. There is a different starter workflow if you need cross-platform coverage. +# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml +name: CMake on a single platform (deploy web pages) + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + workflow_dispatch: + +env: + # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) 
+ BUILD_TYPE: Release + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + build: + # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. + # You can convert this to a matrix build if you need cross-platform coverage. + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + + - name: Setup Pages + uses: actions/configure-pages@v5 + + - name: Run CUDA bash shell Ubuntu/Debian + env: + temp: ${{ runner.temp }} + cuda: "12.9" + run: | + chmod +x ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh + ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh + shell: bash + + - name: Install Doxygen + run: | + sudo apt-get update + sudo apt-get install doxygen + shell: bash + + - name: Configure CMake + # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. + # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type + run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} + + - name: Build + # Build your program with the given configuration + run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} + + - name: Test + working-directory: ${{github.workspace}}/build + # Execute tests defined by the CMake configuration. 
+ # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail + run: ctest -C ${{env.BUILD_TYPE}} + + - name: Upload GitHub Pages artifact + uses: actions/upload-pages-artifact@v3 + with: + # Path of the directory containing the static assets. + path: "build/html/" # default is _site/ + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 + diff --git a/.github/workflows/docker-publish-d.yml b/.github/workflows/docker-publish-d.yml new file mode 100644 index 0000000..2a6ead5 --- /dev/null +++ b/.github/workflows/docker-publish-d.yml @@ -0,0 +1,52 @@ +name: Publish Docker image + +on: + release: + types: [published] + +jobs: + + push_to_registry: + name: Push Docker image + runs-on: ubuntu-22.04 + strategy: + matrix: + image-tag: [ "CUDA-12.9.1-Ubuntu-2004", "CUDA-12.9.1-ubi8" ] + permissions: + packages: write + contents: read + attestations: write + id-token: write + steps: + - name: Check out the repo + uses: actions/checkout@v4 + + - name: Docker Login + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Docker Metadata action + id: meta + uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 + with: + images: ${{ secrets.DOCKER_USERNAME }}/tensor-array + tags: ${{ matrix.image-tag }} + + - name: Build and push Docker images + id: push + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 + with: + context: . 
+ file: ./Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + - name: Attest Build Provenance + uses: actions/attest-build-provenance@v2 + with: + subject-name: index.docker.io/${{ secrets.DOCKER_USERNAME }}/tensor-array + subject-digest: ${{ steps.push.outputs.digest }} + push-to-registry: true diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 0000000..05715a9 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,105 @@ +name: Docker + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +on: + schedule: + - cron: '32 2 * * *' + push: + branches: [ "master" ] + # Publish semver tags as releases. + tags: [ 'v*.*.*' ] + pull_request: + branches: [ "master" ] + +env: + # Use docker.io for Docker Hub if empty + REGISTRY: ghcr.io + # github.repository as / + IMAGE_NAME: ${{ github.repository }} + + +jobs: + build: + strategy: + fail-fast: false + matrix: + image-tag: [ "CUDA-12.9.1-Ubuntu-20.04", "CUDA-12.9.1-ubi8" ] + + runs-on: ubuntu-22.04 + + permissions: + contents: read + packages: write + # This is used to complete the identity challenge + # with sigstore/fulcio when running outside of PRs. 
+ id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Install the cosign tool except on PR + # https://github.com/sigstore/cosign-installer + - name: Install cosign + if: github.event_name != 'pull_request' + uses: sigstore/cosign-installer@59acb6260d9c0ba8f4a2f9d9b48431a222b68e20 #v3.5.0 + with: + cosign-release: 'v2.2.4' + + # Set up BuildKit Docker container builder to be able to build + # multi-platform images and export cache + # https://github.com/docker/setup-buildx-action + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@f95db51fddba0c2d1ec667646a06c2ce06100226 # v3.0.0 + + # Login against a Docker registry except on PR + # https://github.com/docker/login-action + - name: Log into registry ${{ env.REGISTRY }} + if: github.event_name != 'pull_request' + uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Extract metadata (tags, labels) for Docker + # https://github.com/docker/metadata-action + - name: Extract Docker metadata + id: meta + uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934 # v5.0.0 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=raw,value=${{ matrix.image-tag }} + + # Build and push Docker image with Buildx (don't push on PR) + # https://github.com/docker/build-push-action + - name: Build and push Docker image + id: build-and-push + uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09 # v5.0.0 + with: + file: Dockerfolder/${{ matrix.image-tag }}.Dockerfile + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + # Sign the resulting Docker image digest except on PRs. 
+ # This will only write to the public Rekor transparency log when the Docker + # repository is public to avoid leaking data. If you would like to publish + # transparency data even for private images, pass --force to cosign below. + # https://github.com/sigstore/cosign + - name: Sign the published Docker image + if: ${{ github.event_name != 'pull_request' }} + env: + # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable + TAGS: ${{ steps.meta.outputs.tags }} + DIGEST: ${{ steps.build-and-push.outputs.digest }} + # This step uses the identity token to provision an ephemeral certificate + # against the sigstore community Fulcio instance. + run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} diff --git a/.github/workflows/msvc.yml b/.github/workflows/msvc.yml new file mode 100644 index 0000000..d4f9072 --- /dev/null +++ b/.github/workflows/msvc.yml @@ -0,0 +1,74 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. +# +# Find more information at: +# https://github.com/microsoft/msvc-code-analysis-action + +name: Microsoft C++ Code Analysis + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + schedule: + - cron: '37 5 * * 4' + +env: + # Path to the CMake build directory. 
+ build: '${{ github.workspace }}/build' + +permissions: + contents: read + +jobs: + analyze: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: Analyze + runs-on: windows-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Run CUDA bash shell Windows + env: + temp: ${{ runner.temp }} + cuda: "12.4.1" + run: scripts/actions/install-cuda-windows.ps1 + shell: pwsh + + + - name: Configure CMake + run: cmake -B ${{ env.build }} + + # Build is not required unless generated source files are used + # - name: Build CMake + # run: cmake --build ${{ env.build }} + + - name: Initialize MSVC Code Analysis + uses: microsoft/msvc-code-analysis-action@04825f6d9e00f87422d6bf04e1a38b1f3ed60d99 + # Provide a unique ID to access the sarif output path + id: run-analysis + with: + cmakeBuildDirectory: ${{ env.build }} + # Ruleset file that will determine what checks will be run + ruleset: NativeRecommendedRules.ruleset + + # Upload SARIF file to GitHub Code Scanning Alerts + - name: Upload SARIF to GitHub + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: ${{ steps.run-analysis.outputs.sarif }} + + # Upload SARIF file as an Artifact to download and view + # - name: Upload SARIF as an Artifact + # uses: actions/upload-artifact@v4 + # with: + # name: sarif-file + # path: ${{ steps.run-analysis.outputs.sarif }} diff --git a/.gitignore b/.gitignore index bc78b51..f4f2205 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ lib/ bin/ include/ build-temp/ +.vs/ +out/ +CMakeSettings.json diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..4bdb4f5 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,50 @@ +# This file is a template, and might need editing before it works on your 
project. +# This is a sample GitLab CI/CD configuration file that should run without any modifications. +# It demonstrates a basic 3 stage CI/CD pipeline. Instead of real tests or scripts, +# it uses echo commands to simulate the pipeline execution. +# +# A pipeline is composed of independent jobs that run scripts, grouped into stages. +# Stages run in sequential order, but jobs within stages run in parallel. +# +# For more information, see: https://docs.gitlab.com/ee/ci/yaml/#stages +# +# You can copy and paste this template into a new `.gitlab-ci.yml` file. +# You should not add this template to an existing `.gitlab-ci.yml` file by using the `include:` keyword. +# +# To contribute improvements to CI/CD templates, please follow the Development guide at: +# https://docs.gitlab.com/development/cicd/templates/ +# This specific template is located at: +# https://gitlab.com/gitlab-org/gitlab/-/blob/master/lib/gitlab/ci/templates/Getting-Started.gitlab-ci.yml + +stages: # List of stages for jobs, and their order of execution + - build + - test + - deploy + +build-job: # This job runs in the build stage, which runs first. + stage: build + script: + - echo "Compiling the code..." + - echo "Compile complete." + +unit-test-job: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + script: + - echo "Running unit tests... This will take about 60 seconds." + - sleep 60 + - echo "Code coverage is 90%" + +lint-test-job: # This job also runs in the test stage. + stage: test # It can run at the same time as unit-test-job (in parallel). + script: + - echo "Linting code... This will take about 10 seconds." + - sleep 10 + - echo "No lint issues found." + +deploy-job: # This job runs in the deploy stage. + stage: deploy # It only runs when *both* jobs in the test stage complete successfully. + environment: production + script: + - echo "Deploying application..." + - echo "Application successfully deployed." 
+ diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json index d224842..84bab62 100644 --- a/.vscode/c_cpp_properties.json +++ b/.vscode/c_cpp_properties.json @@ -6,7 +6,8 @@ "${workspaceFolder}/src" ], "defines": [], - "compilerPath": "/usr/bin/g++" + "compilerPath": "/usr/bin/g++", + "configurationProvider": "ms-vscode.cmake-tools" }, { "name": "Windows", diff --git a/.vscode/settings.json b/.vscode/settings.json index 821aaac..bbbdf48 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -59,6 +59,13 @@ "stop_token": "cpp", "streambuf": "cpp", "thread": "cpp", - "typeindex": "cpp" + "typeindex": "cpp", + "cassert": "cpp", + "stack": "cpp", + "*.h": "c", + "map": "cpp", + "optional": "cpp", + "fstream": "cpp", + "set": "cpp" } } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 652e3b9..c698055 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,30 +1,38 @@ -cmake_minimum_required(VERSION 3.8.0) +cmake_minimum_required(VERSION 3.18) -project(TensorArray) +project(TensorArray C CXX) include(GNUInstallDirs) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/build-temp/archive) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/build-temp/library) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/build-temp/runtime) -set(CMAKE_OUTPUT ${CMAKE_CURRENT_LIST_DIR}/build-temp/bin) -set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) +# set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) -add_subdirectory("src/tensor_array/core") -add_subdirectory("src/tensor_array/layers") +include(cmake/ta_core_config.cmake) +include(cmake/ta_layers_config.cmake) +include(cmake/ta_interp_config.cmake) + +# add_subdirectory("src/tensor-array/core") +# add_subdirectory("src/tensor-array/layers") +# add_subdirectory("src/tensor-array/interp") + +include(CTest) +if(BUILD_TESTING) + add_subdirectory("tests/tensor-array/core") +endif() + +include(cmake/ta_add_doxygen.cmake) set(CPACK_PACKAGE_NAME 
"TensorArray") set(CPACK_PACKAGE_VENDOR "TensorArray-Creators") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY) -set(CPACK_PACKAGE_VERSION "0.1.0") +set(CPACK_PACKAGE_VERSION "0.2.0") set(CPACK_PACKAGE_VERSION_MAJOR "0") -set(CPACK_PACKAGE_VERSION_MINOR "1") +set(CPACK_PACKAGE_VERSION_MINOR "2") set(CPACK_PACKAGE_VERSION_PATCH "0") set(CPACK_PACKAGE_INSTALL_DIRECTORY "A machine learning libraries") install( EXPORT TensorArrayTargets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/TensorArray + DESTINATION ${CMAKE_INSTALL_LIBDIR}/tensor-array/cmake NAMESPACE TensorArray:: FILE TensorArrayTargets.cmake ) @@ -33,13 +41,13 @@ include(CMakePackageConfigHelpers) configure_package_config_file( "Config.cmake.in" "TensorArrayConfig.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/TensorArray + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/tensor-array/cmake PATH_VARS CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_INCLUDEDIR ) write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/TensorArrayConfigVersion.cmake - VERSION 0.1.0 + VERSION 0.2.0 COMPATIBILITY SameMajorVersion ) @@ -48,7 +56,7 @@ install( FILES ${CMAKE_CURRENT_BINARY_DIR}/TensorArrayConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/TensorArrayConfigVersion.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/TensorArray + DESTINATION ${CMAKE_INSTALL_LIBDIR}/tensor-array/cmake ) include(CPack) diff --git a/Dockerfolder/CUDA-12.9.1-Ubuntu-20.04.Dockerfile b/Dockerfolder/CUDA-12.9.1-Ubuntu-20.04.Dockerfile new file mode 100644 index 0000000..e8226e3 --- /dev/null +++ b/Dockerfolder/CUDA-12.9.1-Ubuntu-20.04.Dockerfile @@ -0,0 +1,35 @@ +FROM nvcr.io/nvidia/cuda:12.9.1-devel-ubuntu20.04 + +RUN apt-get update +RUN apt-get upgrade -y +RUN apt-get install curl -y + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake-ubuntu.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake-ubuntu.sh && 
/tmp/reinstall-cmake-ubuntu.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake-ubuntu.sh + + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends + +WORKDIR /main-project +COPY ./ tensor-array/ + +WORKDIR tensor-array/build + +RUN cmake .. +RUN cmake --build . +RUN cmake --install . +# RUN ctest + +WORKDIR .. diff --git a/Dockerfolder/CUDA-12.9.1-ubi8.Dockerfile b/Dockerfolder/CUDA-12.9.1-ubi8.Dockerfile new file mode 100644 index 0000000..5bbd1ec --- /dev/null +++ b/Dockerfolder/CUDA-12.9.1-ubi8.Dockerfile @@ -0,0 +1,35 @@ +FROM nvcr.io/nvidia/cuda:12.9.1-devel-ubi8 + +RUN dnf update -y +RUN dnf upgrade -y +RUN dnf install curl -y + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake-rhel.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake-rhel.sh && /tmp/reinstall-cmake-rhel.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake-rhel.sh + + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends + +WORKDIR /main-project +COPY ./ tensor-array/ + +WORKDIR tensor-array/build + +RUN cmake .. +RUN cmake --build . +RUN cmake --install . +# RUN ctest + +WORKDIR .. 
diff --git a/README.md b/README.md index c845edb..0ee20a1 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,39 @@ # Tensor-Array ![C++](https://img.shields.io/badge/C%2B%2B-17-blue) +[![Docker Image Size with architecture (latest by date/latest semver)](https://img.shields.io/docker/image-size/noobwastaken/tensor-array) +](https://hub.docker.com/repository/docker/noobwastaken/tensor-array/general) + A C++ Tensor library that can be used to work with machine learning or deep learning project. Build your own neural network models with this library. +## Installing `Tensor-Array` + +You need to clone repository by using [Git](https://git-scm.com/) +You need to install `Tensor-Array` with [CMake](https://cmake.org/) + +```shell +git clone https://github.com/Tensor-Array/Tensor-Array.git +cd Tensor-Array +mkdir build +cd build +cmake .. +cmake --build . +cmake --install . +cd .. +``` ## Why this repository named `Tensor-Array` We created a template struct that named `TensorArray`. That struct is a multi-dimensional array wrapper. 
```C++ -#include "tensor_array/core/tensorbase.hh" +#include -using tensor_array::value; +using namespace tensor_array::value; int main() { @@ -59,13 +77,14 @@ The `Tensor::get_grad()` method can get the gradient after call `Tensor::calc_gr ```C++ #include -#include "tensor_array/core/tensor.hh" +#include -using tensor_array::value; +using namespace std; +using namespace tensor_array::value; int main() { - tensor_array::value::TensorArray example_tensor_array = + TensorArray example_tensor_array = {{ {{ 1, 2, 3, 4 }}, {{ 5, 6, 7, 8 }}, @@ -76,13 +95,11 @@ int main() Tensor example_tensor_1(example_tensor_array); Tensor example_tensor_2(example_tensor_array_scalar); Tensor example_tensor_sum = example_tensor_1 + example_tensor_2; - std::cout << example_tensor_sum << std::endl; + cout << example_tensor_sum << endl; example_tensor_sum.calc_grad(); - std::cout << example_tensor_1.get_grad() << std::endl; - std::cout << example_tensor_2.get_grad() << std::endl; + cout << example_tensor_1.get_grad() << endl; + cout << example_tensor_2.get_grad() << endl; return 0; } ``` - - diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..1ae7a71 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,11 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +| ------- | ------------------ | +| 2025 | :white_check_mark: | +| 2024 | :x: | +| 2023 | :x: | + +## Reporting a Vulnerability diff --git a/cmake/ta_add_doxygen.cmake b/cmake/ta_add_doxygen.cmake new file mode 100644 index 0000000..94a1bf6 --- /dev/null +++ b/cmake/ta_add_doxygen.cmake @@ -0,0 +1,11 @@ +find_package(Doxygen) + +if(Doxygen_FOUND) + set(DOXYGEN_GENERATE_HTML YES) + set(DOXYGEN_FILE_PATTERNS *.c *.cc *.h *.hh *.cu *.md) + set(DOXYGEN_EXTENSION_MAPPING "*.cu=c++") + set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "${PROJECT_SOURCE_DIR}/README.md") + set(DOXYGEN_EXCLUDE_PATTERNS ".*/*" "build/*" "cmake/*" "scripts/*" "CMakeLists.txt" "*.cmake" "*.cmake.*") + + doxygen_add_docs(tensorarray_docs 
${PROJECT_SOURCE_DIR} ALL) +endif() diff --git a/cmake/ta_core_config.cmake b/cmake/ta_core_config.cmake new file mode 100644 index 0000000..e1a2c2a --- /dev/null +++ b/cmake/ta_core_config.cmake @@ -0,0 +1,94 @@ +set(TensorArray_Core_Dir tensor-array/core) + +file( + GLOB TensorArray_Core_inc + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Core_Dir}/*.h" + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Core_Dir}/*.hh" +) + +install( + FILES ${TensorArray_Core_inc} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${TensorArray_Core_Dir}" + COMPONENT headers +) + +include(CheckLanguage) +check_language(CUDA) + +file(GLOB TensorArray_Core_cc "${PROJECT_SOURCE_DIR}/src/${TensorArray_Core_Dir}/*.cc") + +if (CMAKE_CUDA_COMPILER) + file(GLOB TensorArray_Core_cu "${PROJECT_SOURCE_DIR}/src/${TensorArray_Core_Dir}/*.cu") +endif() + +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + add_library(tensorarray_core_object OBJECT ${TensorArray_Core_cc} ${TensorArray_Core_cu}) + set_property(TARGET tensorarray_core_object PROPERTY CUDA_STANDARD 17) + set_property(TARGET tensorarray_core_object PROPERTY CUDA_STANDARD_REQUIRED ON) + set_property(TARGET tensorarray_core_object PROPERTY CUDA_EXTENSIONS OFF) + set_property(TARGET tensorarray_core_object PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_include_directories(tensorarray_core_object PRIVATE $<$:${CUDAToolkit_INCLUDE_DIRS}>) + if(MSVC) + target_compile_definitions(tensorarray_core_object PRIVATE TENSOR_ARRAY_CORE_EXPORTS) + endif() + + # find_package(CUDAToolkit REQUIRED) + # set(CMAKE_CUDA_ARCHITECTURES 52 75 89) + # set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + # list(APPEND CMAKE_CUDA_FLAGS "--default-stream per-thread") +else() + add_library(tensorarray_core_object OBJECT ${TensorArray_Core_cc}) +endif() + +# file(MAKE_DIRECTORY "include/tensor_array/core") + +set_property(TARGET tensorarray_core_object PROPERTY C_STANDARD 11) +set_property(TARGET tensorarray_core_object PROPERTY C_STANDARD_REQUIRED ON) 
+set_property(TARGET tensorarray_core_object PROPERTY C_EXTENSIONS OFF) + +set_property(TARGET tensorarray_core_object PROPERTY CXX_STANDARD 17) +set_property(TARGET tensorarray_core_object PROPERTY CXX_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_core_object PROPERTY CXX_EXTENSIONS OFF) + +# shared libraries need PIC +set_property(TARGET tensorarray_core_object PROPERTY POSITION_INDEPENDENT_CODE 1) + +# shared and static libraries built from the same object files +add_library(tensorarray_core SHARED $) +add_library(tensorarray_core_static STATIC $) + +if(CUDAToolkit_FOUND) + set_property(TARGET tensorarray_core PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries( + tensorarray_core + PRIVATE $<$:CUDA::cublas> + ) +endif() + +install( + TARGETS tensorarray_core + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + +install( + TARGETS tensorarray_core_static + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + +add_library(TensorArray::core ALIAS tensorarray_core) +add_library(TensorArray::core_static ALIAS tensorarray_core_static) +add_library(TensorArray::core_object ALIAS tensorarray_core_object) diff --git a/cmake/ta_interp_config.cmake b/cmake/ta_interp_config.cmake new file mode 100644 index 0000000..9f43e0d --- /dev/null +++ b/cmake/ta_interp_config.cmake @@ -0,0 +1,50 @@ +set(TensorArray_Interpreter_Dir tensor-array/interp) + +file( + GLOB TensorArray_Interpreter_inc + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Interpreter_Dir}/*.h" + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Interpreter_Dir}/*.hh" +) + +install( + FILES 
${TensorArray_Interpreter_inc} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${TensorArray_Interpreter_Dir}" + COMPONENT headers +) + +file( + GLOB TensorArray_Interpreter_src + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Interpreter_Dir}/*.c" + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Interpreter_Dir}/*.cc" +) +add_executable(tensorarray_interpreter ${TensorArray_Interpreter_src}) + +target_include_directories(tensorarray_interpreter PRIVATE ${PROJECT_SOURCE_DIR}/src) +target_link_libraries(tensorarray_interpreter PUBLIC TensorArray::core) + +set_property(TARGET tensorarray_interpreter PROPERTY C_STANDARD 11) +set_property(TARGET tensorarray_interpreter PROPERTY C_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_interpreter PROPERTY C_EXTENSIONS OFF) + +set_property(TARGET tensorarray_interpreter PROPERTY CXX_STANDARD 17) +set_property(TARGET tensorarray_interpreter PROPERTY CXX_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_interpreter PROPERTY CXX_EXTENSIONS OFF) + +install( + TARGETS tensorarray_interpreter + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + #[[ + add_custom_command( + OUTPUT test.tmp + DEPENDS tensorarray_interpreter + POST_BUILD + COMMAND tensorarray_interpreter) + ]] +add_executable(TensorArray::interpreter ALIAS tensorarray_interpreter) diff --git a/cmake/ta_layers_config.cmake b/cmake/ta_layers_config.cmake new file mode 100644 index 0000000..d0961a9 --- /dev/null +++ b/cmake/ta_layers_config.cmake @@ -0,0 +1,66 @@ +set(TensorArray_Layers_Dir tensor-array/layers) + +file( + GLOB TensorArray_Layers_inc + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Layers_Dir}/*.h" + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Layers_Dir}/*.hh" +) + +install( + FILES ${TensorArray_Layers_inc} + DESTINATION 
"${CMAKE_INSTALL_INCLUDEDIR}/${TensorArray_Layers_Dir}" + COMPONENT headers +) + +file(GLOB TensorArray_Layers_src "${PROJECT_SOURCE_DIR}/src/${TensorArray_Layers_Dir}/*.cc") + +add_library(tensorarray_layers_object OBJECT ${TensorArray_Layers_src}) + +target_include_directories(tensorarray_layers_object PRIVATE ${PROJECT_SOURCE_DIR}/src) + +set_property(TARGET tensorarray_layers_object PROPERTY C_STANDARD 11) +set_property(TARGET tensorarray_layers_object PROPERTY C_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_layers_object PROPERTY C_EXTENSIONS OFF) + +set_property(TARGET tensorarray_layers_object PROPERTY CXX_STANDARD 17) +set_property(TARGET tensorarray_layers_object PROPERTY CXX_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_layers_object PROPERTY CXX_EXTENSIONS OFF) + +# shared libraries need PIC +set_property(TARGET tensorarray_layers_object PROPERTY POSITION_INDEPENDENT_CODE 1) + +if(MSVC) + target_compile_definitions(tensorarray_layers_object PRIVATE TENSOR_ARRAY_LAYERS_EXPORTS) +endif() + +# shared and static libraries built from the same object files +add_library(tensorarray_layers SHARED $) +add_library(tensorarray_layers_static STATIC $) + +target_link_libraries(tensorarray_layers PUBLIC TensorArray::core) + +install( + TARGETS tensorarray_layers + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + +install( + TARGETS tensorarray_layers_static + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + +add_library(TensorArray::layers ALIAS tensorarray_layers) +add_library(TensorArray::layers_static ALIAS 
tensorarray_layers_static) +add_library(TensorArray::layers_object ALIAS tensorarray_layers_object) diff --git a/scripts/actions/install-cuda-rhel.sh b/scripts/actions/install-cuda-rhel.sh new file mode 100644 index 0000000..65e8b1b --- /dev/null +++ b/scripts/actions/install-cuda-rhel.sh @@ -0,0 +1,152 @@ +CUDA_PACKAGES_IN=( + "cuda-compiler" + "cuda-cudart-devel" + "cuda-nvtx" + "cuda-nvrtc" + "cuda-cccl" + "libcurand-devel" + "libcublas-devel" + "libcufft-devel" +) + + +function version_ge() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$2" ] +} +# returns 0 (true) if a > b +function version_gt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_ge $1 $2 +} +# returns 0 (true) if a <= b +function version_le() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$1" ] +} +# returns 0 (true) if a < b +function version_lt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_le $1 $2 +} + + +LINUX_ID=$(lsb_release -si) +LINUX_ID="${LINUX_ID,,}" + +LINUX_VERSION=$(lsb_release -sr) +LINUX_VERSION="${LINUX_VERSION//.}" + +LINUX_VERSION_MAJOR_MINOR=$(lsb_release -sr) +LINUX_MAJOR=$(echo "${LINUX_VERSION_MAJOR_MINOR}" | cut -d. -f1) +LINUX_MINOR=$(echo "${LINUX_VERSION_MAJOR_MINOR}" | cut -d. -f2) +LINUX_PATCH=$(echo "${LINUX_VERSION_MAJOR_MINOR}" | cut -d. 
-f3) + +YUM_PACKAGE_MANAGER="yum" +YUM_CONFIG_MANAGER="yum-config-manager" + +if [[ "${LINUX_ID}" == "almalinux" || "${LINUX_ID}" == "centos" || "${LINUX_ID}" == "oracle" ]]; then + echo "LINUX_ID: ${LINUX_ID} change to rhel" + LINUX_ID="rhel" + if [[ "${LINUX_MAJOR}" -ge "8" ]]; then + YUM_PACKAGE_MANAGER="dnf" + YUM_CONFIG_MANAGER="dnf config-manager" + fi + LINUX_VERSION=${LINUX_MAJOR} +fi + +LOCATION_TEMP=${temp} + +CUDA_VERSION_MAJOR_MINOR=${cuda} + +CUDA_MAJOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f1) +CUDA_MINOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f2) +CUDA_PATCH=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f3) + +CPU_ARCH=$(uname -m) +if [[ "${CPU_ARCH}" == "aarch64" ]] +then + CPU_ARCH="sbsa" +fi + +for package in "${CUDA_PACKAGES_IN[@]}" +do : + # @todo This is not perfect. Should probably provide a separate list for diff versions + # cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y + if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" + then + package="cuda-compiler" + elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" + then + package="cuda-nvcc" + # CUB/Thrust are packages in cuda-thrust in 11.3, but cuda-cccl in 11.4+ + elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]] + then + # CUDA cuda-thrust >= 11.4 + if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" + then + package="cuda-cccl" + # Use cuda-thrust > 11.2 + elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" + then + package="cuda-thrust" + # Do not include this package < 11.3 + else + continue + fi + fi + # Build the full package name and append to the string.
+ CUDA_PACKAGES+=" ${package}-${CUDA_MAJOR}-${CUDA_MINOR}" +done +echo "CUDA_PACKAGES ${CUDA_PACKAGES}" + +REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/${LINUX_ID}${LINUX_VERSION}/${CPU_ARCH}/cuda-${LINUX_ID}${LINUX_VERSION}.repo" + +is_root=false +if (( $EUID == 0)) +then + is_root=true +fi +# Find if sudo is available +has_sudo=false +if command -v sudo &> /dev/null +then + has_sudo=true +fi +# Decide if we can proceed or not (root or sudo is required) and if so store whether sudo should be used or not. +if [ "$is_root" = false ] && [ "$has_sudo" = false ] +then + echo "Root or sudo is required. Aborting." + exit 1 +elif [ "$is_root" = false ] +then + USE_SUDO=sudo +else + USE_SUDO= +fi + +echo "Adding CUDA Repository" +$USE_SUDO $YUM_CONFIG_MANAGER --add-repo ${REPO_URL} +$USE_SUDO $YUM_PACKAGE_MANAGER clean all + +$USE_SUDO $YUM_PACKAGE_MANAGER install -y ${CUDA_PACKAGES} + +if [[ $? -ne 0 ]] +then + echo "CUDA Installation Error." + exit 1 +fi + +CUDA_PATH=/usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} +echo "CUDA_PATH=${CUDA_PATH}" +export CUDA_PATH=${CUDA_PATH} +export PATH="$PATH:$CUDA_PATH/bin" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_PATH/lib" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_PATH/lib64" + +if [[ $GITHUB_ACTIONS ]] +then + echo "${CUDA_PATH}/bin" >> $GITHUB_PATH + echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV +fi diff --git a/scripts/actions/install-cuda-ubuntu.sh b/scripts/actions/install-cuda-ubuntu.sh index 45d85dd..34a507f 100644 --- a/scripts/actions/install-cuda-ubuntu.sh +++ b/scripts/actions/install-cuda-ubuntu.sh @@ -3,10 +3,10 @@ CUDA_PACKAGES_IN=( "cuda-cudart" "cuda-nvtx" "cuda-nvrtc" + "cuda-cccl" "libcurand-dev" "libcublas-dev" "libcufft-dev" - "cuda-cccl" ) function version_ge() { @@ -44,22 +44,34 @@ CUDA_MAJOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f1) CUDA_MINOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. 
-f2) CUDA_PATCH=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f3) +CPU_ARCH=$(uname -m) +if [[ "${CPU_ARCH}" == "aarch64" ]] +then + CPU_ARCH="sbsa" +fi + + CUDA_PACKAGES="" for package in "${CUDA_PACKAGES_IN[@]}" do : # @todo This is not perfect. Should probably provide a separate list for diff versions # cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y - if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" + then package="cuda-compiler" - elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" + then package="cuda-nvcc" # CUB/Thrust are packages in cuda-thrust in 11.3, but cuda-cccl in 11.4+ - elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]]; then + elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]] + then # CUDA cuda-thrust >= 11.4 - if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" ; then + if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" + then package="cuda-cccl" # Use cuda-thrust > 11.2 - elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" ; then + elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" + then package="cuda-thrust" # Do not include this pacakge < 11.3 else @@ -71,27 +83,30 @@ do : done echo "CUDA_PACKAGES ${CUDA_PACKAGES}" -CPU_ARCH="x86_64" PIN_FILENAME="cuda-${LINUX_ID}${LINUX_VERSION}.pin" PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/${LINUX_ID}${LINUX_VERSION}/${CPU_ARCH}/${PIN_FILENAME}" -KERYRING_PACKAGE_FILENAME="cuda-keyring_1.0-1_all.deb" +KERYRING_PACKAGE_FILENAME="cuda-keyring_1.1-1_all.deb" KEYRING_PACKAGE_URL="https://developer.download.nvidia.com/compute/cuda/repos/${LINUX_ID}${LINUX_VERSION}/${CPU_ARCH}/${KERYRING_PACKAGE_FILENAME}" 
REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/${LINUX_ID}${LINUX_VERSION}/${CPU_ARCH}/" is_root=false -if (( $EUID == 0)); then +if (( $EUID == 0)) +then is_root=true fi # Find if sudo is available has_sudo=false -if command -v sudo &> /dev/null ; then +if command -v sudo &> /dev/null +then has_sudo=true fi # Decide if we can proceed or not (root or sudo is required) and if so store whether sudo should be used or not. -if [ "$is_root" = false ] && [ "$has_sudo" = false ]; then +if [ "$is_root" = false ] && [ "$has_sudo" = false ] +then echo "Root or sudo is required. Aborting." exit 1 -elif [ "$is_root" = false ] ; then +elif [ "$is_root" = false ] +then USE_SUDO=sudo else USE_SUDO= @@ -106,7 +121,8 @@ $USE_SUDO apt-get update $USE_SUDO apt-get -y install ${CUDA_PACKAGES} -if [[ $? -ne 0 ]]; then +if [[ $? -ne 0 ]] +then echo "CUDA Installation Error." exit 1 fi @@ -114,14 +130,14 @@ fi CUDA_PATH=/usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} echo "CUDA_PATH=${CUDA_PATH}" export CUDA_PATH=${CUDA_PATH} -export PATH="$CUDA_PATH/bin:$PATH" -export LD_LIBRARY_PATH="$CUDA_PATH/lib:$LD_LIBRARY_PATH" -export LD_LIBRARY_PATH="$CUDA_PATH/lib64:$LD_LIBRARY_PATH" +export PATH="$PATH:$CUDA_PATH/bin" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_PATH/lib" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_PATH/lib64" -if [[ $GITHUB_ACTIONS ]]; then - # Set paths for subsequent steps, using ${CUDA_PATH} +if [[ $GITHUB_ACTIONS ]] +then echo "Adding CUDA to CUDA_PATH, PATH and LD_LIBRARY_PATH" echo "${CUDA_PATH}/bin" >> $GITHUB_PATH - echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV fi diff --git a/scripts/actions/install-cuda-windows.ps1 b/scripts/actions/install-cuda-windows.ps1 new file mode 100644 index 0000000..c47930b --- /dev/null +++ 
b/scripts/actions/install-cuda-windows.ps1 @@ -0,0 +1,56 @@ +$CUDA_PACKAGES_IN = @( + "nvcc" + "visual_studio_integration" + "cudart" + "nvtx" + "nvrtc" + "thrust" + "curand_dev" + "cublas_dev" + "cufft_dev" +) + +function Version-Ge($a, $b) { + return ([version]$a -ge [version]$b) +} +function Version-Gt($a, $b) { + return ([version]$a -gt [version]$b) +} +function Version-Le($a, $b) { + return ([version]$a -le [version]$b) +} +function Version-Lt($a, $b) { + return ([version]$a -lt [version]$b) +} + +# Expect $env:cuda to be set, e.g. "12.4.1" +$CUDA_VERSION_MAJOR_MINOR = $env:cuda + +$parts = $CUDA_VERSION_MAJOR_MINOR.Split('.') +$CUDA_MAJOR = $parts[0] +$CUDA_MINOR = $parts[1] +$CUDA_PATCH = if ($parts.Count -gt 2) { $parts[2] } else { "0" } + +$CUDA_PACKAGES = "" +foreach ($package in $CUDA_PACKAGES_IN) { + $CUDA_PACKAGES += " ${package}_${CUDA_MAJOR}.${CUDA_MINOR}" +} +Write-Host "CUDA_PACKAGES $CUDA_PACKAGES" + +$cudaInstallerUrl = "https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION_MAJOR_MINOR/network_installers/cuda_${CUDA_VERSION_MAJOR_MINOR}_windows_network.exe" +Invoke-WebRequest -Uri $cudaInstallerUrl -OutFile "cuda_installer.exe" +Start-Process -FilePath ".\cuda_installer.exe" -ArgumentList "-s $CUDA_PACKAGES" -Wait +Remove-Item "cuda_installer.exe" -Force + +$CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$CUDA_MAJOR.$CUDA_MINOR" +Write-Host "CUDA_PATH=$CUDA_PATH" +$env:CUDA_PATH = $CUDA_PATH + +# If executing on github actions, emit the appropriate echo statements to update environment variables +if (Test-Path "env:GITHUB_ACTIONS") { + # Set paths for subsequent steps, using $env:CUDA_PATH + Write-Host "Adding CUDA to CUDA_PATH, and PATH" + Add-Content -Path $env:GITHUB_ENV -Value "CUDA_PATH=$env:CUDA_PATH" + Add-Content -Path $env:GITHUB_ENV -Value "CUDA_PATH_V${CUDA_MAJOR}_${CUDA_MINOR}=$env:CUDA_PATH" + Add-Content -Path $env:GITHUB_PATH -Value "$env:CUDA_PATH\bin" +} diff --git 
a/scripts/actions/install-rocm-ubuntu.sh b/scripts/actions/install-rocm-ubuntu.sh new file mode 100644 index 0000000..7039c8d --- /dev/null +++ b/scripts/actions/install-rocm-ubuntu.sh @@ -0,0 +1,122 @@ +ROCM_PACKAGES_IN=( + rocm-hip-runtime-dev +) + +function version_ge() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$2" ] +} +# returns 0 (true) if a > b +function version_gt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_ge $1 $2 +} +# returns 0 (true) if a <= b +function version_le() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$1" ] +} +# returns 0 (true) if a < b +function version_lt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_le $1 $2 +} + + +LINUX_ID=$(lsb_release -si) +LINUX_ID="${LINUX_ID,,}" + +LINUX_VERSION=$(lsb_release -sr) +LINUX_VERSION="${LINUX_VERSION//.}" + +LINUX_CODENAME=$(lsb_release -cs) +LINUX_CODENAME="${LINUX_CODENAME,,}" + +LOCATION_TEMP=${temp} + +ROCM_VERSION_MAJOR_MINOR=${rocm} + +CPU_ARCH=$(uname -m) +if [[ "${CPU_ARCH}" == "aarch64" ]] +then + CPU_ARCH="sbsa" +fi + + +ROCM_PACKAGES="" +for package in "${ROCM_PACKAGES_IN[@]}" +do : + # Build the full package name and append to the string. + ROCM_PACKAGES+=" ${package}" +done +echo "ROCM_PACKAGES ${ROCM_PACKAGES}" + +GPG_FILENAME="rocm.gpg.key" +GPG_URL="https://repo.radeon.com/rocm/${GPG_FILENAME}" +REPO_URL="https://repo.radeon.com/rocm/apt/${rocm}/" + +is_root=false +if (( $EUID == 0)) +then + is_root=true +fi +# Find if sudo is available +has_sudo=false +if command -v sudo &> /dev/null +then + has_sudo=true +fi +# Decide if we can proceed or not (root or sudo is required) and if so store whether sudo should be used or not. 
+if [ "$is_root" = false ] && [ "$has_sudo" = false ] +then + echo "Root or sudo is required. Aborting." + exit 1 +elif [ "$is_root" = false ] +then + USE_SUDO=sudo +else + USE_SUDO= +fi + +KEYRINGS_DIR=/etc/apt/keyrings + +if [ ! -e $KEYRINGS_DIR ] +then + echo "Create directory: ${KEYRINGS_DIR}" + $USE_SUDO mkdir --parents --mode=0755 ${KEYRINGS_DIR} +fi + +ROCM_GPG_KEYRING=${KEYRINGS_DIR}/rocm.gpg + +echo "Adding ROCm Repository:" +wget ${GPG_URL} -O - | \ + gpg --dearmor | $USE_SUDO tee ${ROCM_GPG_KEYRING} > /dev/null +echo "deb [arch=amd64 signed-by=${ROCM_GPG_KEYRING}] ${REPO_URL} ${LINUX_CODENAME} main" \ + | $USE_SUDO tee /etc/apt/sources.list.d/rocm.list +echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \ + | $USE_SUDO tee /etc/apt/preferences.d/rocm-pin-600 +echo "Adding ROCm Repository completed." +$USE_SUDO apt-get update + +$USE_SUDO apt-get -y install ${ROCM_PACKAGES} + +if [[ $? -ne 0 ]] +then + echo "ROCm Installation Error." + exit 1 +fi + +ROCM_PATH=/opt/rocm-${rocm} +echo "ROCM_PATH=${ROCM_PATH}" +export ROCM_PATH=${ROCM_PATH} +export PATH="$PATH:$ROCM_PATH/bin" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$ROCM_PATH/lib" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$ROCM_PATH/lib64" + +if [[ $GITHUB_ACTIONS ]] +then + echo "Adding ROCM to ROCM_PATH, PATH and LD_LIBRARY_PATH" + echo "${ROCM_PATH}/bin" >> $GITHUB_PATH + echo "ROCM_PATH=${ROCM_PATH}" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV +fi diff --git a/scripts/packages-install/reinstall-cmake-rhel.sh b/scripts/packages-install/reinstall-cmake-rhel.sh new file mode 100644 index 0000000..0b7ee15 --- /dev/null +++ b/scripts/packages-install/reinstall-cmake-rhel.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +#------------------------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. +#------------------------------------------------------------------------------------------------------------- +# +set -e + +CMAKE_VERSION=${1:-"none"} + +if [ "${CMAKE_VERSION}" = "none" ]; then + echo "No CMake version specified, skipping CMake reinstallation" + exit 0 +fi + +# Cleanup temporary directory and associated files when exiting the script. +cleanup() { + EXIT_CODE=$? + set +e + if [[ -n "${TMP_DIR}" ]]; then + echo "Executing cleanup of tmp files" + rm -Rf "${TMP_DIR}" + fi + exit $EXIT_CODE +} +trap cleanup EXIT + + +echo "Installing CMake..." +dnf -y remove cmake +dnf -y autoremove +mkdir -p /opt/cmake + +architecture=$(arch) +ARCH=${architecture} + +CMAKE_BINARY_NAME="cmake-${CMAKE_VERSION}-linux-${ARCH}.sh" +CMAKE_CHECKSUM_NAME="cmake-${CMAKE_VERSION}-SHA-256.txt" +TMP_DIR=$(mktemp -d -t cmake-XXXXXXXXXX) + +echo "${TMP_DIR}" +cd "${TMP_DIR}" + +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_BINARY_NAME}" -O +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_CHECKSUM_NAME}" -O + +sha256sum -c --ignore-missing "${CMAKE_CHECKSUM_NAME}" +sh "${TMP_DIR}/${CMAKE_BINARY_NAME}" --prefix=/opt/cmake --skip-license + +ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake +ln -s /opt/cmake/bin/ctest /usr/local/bin/ctest diff --git a/scripts/packages-install/reinstall-cmake-ubuntu.sh b/scripts/packages-install/reinstall-cmake-ubuntu.sh new file mode 100644 index 0000000..408b81d --- /dev/null +++ b/scripts/packages-install/reinstall-cmake-ubuntu.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +#------------------------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 
+#------------------------------------------------------------------------------------------------------------- +# +set -e + +CMAKE_VERSION=${1:-"none"} + +if [ "${CMAKE_VERSION}" = "none" ]; then + echo "No CMake version specified, skipping CMake reinstallation" + exit 0 +fi + +# Cleanup temporary directory and associated files when exiting the script. +cleanup() { + EXIT_CODE=$? + set +e + if [[ -n "${TMP_DIR}" ]]; then + echo "Executing cleanup of tmp files" + rm -Rf "${TMP_DIR}" + fi + exit $EXIT_CODE +} +trap cleanup EXIT + + +echo "Installing CMake..." +apt-get -y purge --auto-remove cmake +mkdir -p /opt/cmake + +architecture=$(dpkg --print-architecture) +case "${architecture}" in + arm64) + ARCH=aarch64 ;; + amd64) + ARCH=x86_64 ;; + *) + echo "Unsupported architecture ${architecture}." + exit 1 + ;; +esac + +CMAKE_BINARY_NAME="cmake-${CMAKE_VERSION}-linux-${ARCH}.sh" +CMAKE_CHECKSUM_NAME="cmake-${CMAKE_VERSION}-SHA-256.txt" +TMP_DIR=$(mktemp -d -t cmake-XXXXXXXXXX) + +echo "${TMP_DIR}" +cd "${TMP_DIR}" + +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_BINARY_NAME}" -O +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_CHECKSUM_NAME}" -O + +sha256sum -c --ignore-missing "${CMAKE_CHECKSUM_NAME}" +sh "${TMP_DIR}/${CMAKE_BINARY_NAME}" --prefix=/opt/cmake --skip-license + +ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake +ln -s /opt/cmake/bin/ctest /usr/local/bin/ctest diff --git a/src/tensor_array/core/data_type_wrapper.cu b/src/tensor-array/core/data_type_wrapper.cc similarity index 86% rename from src/tensor_array/core/data_type_wrapper.cu rename to src/tensor-array/core/data_type_wrapper.cc index 95d7cbe..478f0c6 100644 --- a/src/tensor_array/core/data_type_wrapper.cu +++ b/src/tensor-array/core/data_type_wrapper.cc @@ -32,12 +32,12 @@ limitations under the License. 
typedef __nv_bfloat16 bfloat16; -#define USING_DATA_TYPE_NVIDIA_FLOAT_8 (__nv_fp8_e5m2)(__nv_fp8_e4m3) -#define USING_DATA_TYPE_NVIDIA_FLOAT (half)(bfloat16) -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT USING_DATA_TYPE_NVIDIA_FLOAT +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (half)(bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() USING_DATA_TYPE_NVIDIA_FLOAT() namespace tensor_array { diff --git a/src/tensor_array/core/data_type_wrapper.hh b/src/tensor-array/core/data_type_wrapper.hh similarity index 100% rename from src/tensor_array/core/data_type_wrapper.hh rename to src/tensor-array/core/data_type_wrapper.hh diff --git a/src/tensor_array/core/devices.cu b/src/tensor-array/core/devices.cc similarity index 99% rename from src/tensor_array/core/devices.cu rename to src/tensor-array/core/devices.cc index c3a1694..fce05f8 100644 --- a/src/tensor_array/core/devices.cu +++ b/src/tensor-array/core/devices.cc @@ -14,11 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "devices.hh" +#include +#include #include #include #include -#include namespace tensor_array { diff --git a/src/tensor_array/core/devices.hh b/src/tensor-array/core/devices.hh similarity index 82% rename from src/tensor_array/core/devices.hh rename to src/tensor-array/core/devices.hh index ca49598..e3aeb41 100644 --- a/src/tensor_array/core/devices.hh +++ b/src/tensor-array/core/devices.hh @@ -16,14 +16,14 @@ limitations under the License. #pragma once -#ifdef __WIN32__ -#ifdef CUDA_ML_EXPORTS -#define CUDA_ML_API __declspec(dllexport) +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) #else -#define CUDA_ML_API __declspec(dllimport) +#define TENSOR_ARRAY_API __declspec(dllimport) #endif #else -#define CUDA_ML_API +#define TENSOR_ARRAY_API #endif namespace tensor_array @@ -33,7 +33,7 @@ namespace tensor_array enum DeviceType { CPU, - CUDA + CUDA, }; struct Device @@ -44,7 +44,7 @@ namespace tensor_array constexpr Device DEVICE_CPU_0{ CPU,0 }; - CUDA_ML_API Device& local_device(); + TENSOR_ARRAY_API Device& local_device(); void device_memcpy(void*, Device, const void*, Device, size_t); @@ -54,7 +54,7 @@ namespace tensor_array void device_memset(void*, Device, int, size_t, void*); - CUDA_ML_API void device_CUDA_get_info(); + TENSOR_ARRAY_API void device_CUDA_get_info(); } } @@ -66,4 +66,4 @@ void operator delete(void*, tensor_array::devices::Device); void operator delete(void*, tensor_array::devices::Device, void*); -#undef CUDA_ML_API \ No newline at end of file +#undef TENSOR_ARRAY_API \ No newline at end of file diff --git a/src/tensor_array/core/extern_type_map.cc b/src/tensor-array/core/extern_type_map.cc similarity index 100% rename from src/tensor_array/core/extern_type_map.cc rename to src/tensor-array/core/extern_type_map.cc diff --git a/src/tensor-array/core/extern_type_map.hh b/src/tensor-array/core/extern_type_map.hh new file mode 100644 index 0000000..9e33356 --- /dev/null +++ 
b/src/tensor-array/core/extern_type_map.hh @@ -0,0 +1,41 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include + +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_CORE_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + +namespace tensor_array +{ + namespace value + { + /** + * Map of data types. + */ + extern TENSOR_ARRAY_API std::unordered_map dynamic_type_size; + } +} + +#undef TENSOR_ARRAY_API diff --git a/src/tensor-array/core/initializer_wrapper.hh b/src/tensor-array/core/initializer_wrapper.hh new file mode 100644 index 0000000..8245f0b --- /dev/null +++ b/src/tensor-array/core/initializer_wrapper.hh @@ -0,0 +1,96 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include + +namespace tensor_array +{ + namespace wrapper + { +#ifdef _MSC_VER + template + class initializer_wrapper: public std::initializer_list<_E> + { + public: + typedef _E value_type; + typedef const _E& reference; + typedef const _E& const_reference; + typedef size_t size_type; + typedef const _E* iterator; + typedef const _E* const_iterator; + public: + constexpr initializer_wrapper(const_iterator __a, size_type __l) + : std::initializer_list<_E>(__a, __a + __l) { } + + constexpr initializer_wrapper(const_iterator __begin, const_iterator __end) + : std::initializer_list<_E>(__begin, __end) { } + + constexpr initializer_wrapper() noexcept: std::initializer_list<_E>() { } + }; +#else + template + class initializer_wrapper + { + public: + typedef _E value_type; + typedef const _E& reference; + typedef const _E& const_reference; + typedef size_t size_type; + typedef const _E* iterator; + typedef const _E* const_iterator; + + private: +#ifdef __GNUC__ + iterator _M_array; + size_type _M_len; +#endif + + public: + constexpr initializer_wrapper(const_iterator __a, size_type __l) +#ifdef __GNUC__ + : _M_array(__a), _M_len(__l) { } +#endif + + constexpr initializer_wrapper(const_iterator __begin, const_iterator __end) + : _M_array(__begin), _M_len(__end - __begin) + { } + + constexpr initializer_wrapper() noexcept: _M_array(0), _M_len(0) { } + + // Number of elements. + constexpr size_type + size() const noexcept + { + return _M_len; + } + + // First element. + constexpr const_iterator + begin() const noexcept { + return _M_array; + } + + // One past the last element. 
+ constexpr const_iterator + end() const noexcept { + return begin() + size(); + } + + constexpr operator std::initializer_list<_E>() const { return reinterpret_cast&>(*this); } + }; +#endif // !_MSC_VER + } +} \ No newline at end of file diff --git a/src/tensor_array/core/tensor.cc b/src/tensor-array/core/tensor.cc similarity index 97% rename from src/tensor_array/core/tensor.cc rename to src/tensor-array/core/tensor.cc index 9b85cac..7d10a75 100644 --- a/src/tensor_array/core/tensor.cc +++ b/src/tensor-array/core/tensor.cc @@ -14,23 +14,23 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include -#include -#include "data_type_wrapper.hh" #ifndef TENSOR_CONTENT #define TENSOR_CONTENT #include "tensor.hh" #undef TENSOR_CONTENT #endif // !TENSOR_CONTENT + +#include +#include #include +#include +#include +#include "data_type_wrapper.hh" -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) @@ -47,6 +47,16 @@ namespace tensor_array { bool use_grad = true; + bool is_use_grad() + { + return use_grad; + } + + void set_use_grad(bool use) + { + use_grad = use; + } + class Tensor::TensorContent { private: @@ -140,7 +150,7 @@ namespace tensor_array void Tensor::TensorContent::reset_grad() { - std::lock_guard tensor_lock(this->tensor_mutex); + std::lock_guard tensor_lock(this->tensor_mutex); this->grad = 
zeros(this->buf.shape()).tensor_cast(this->buf.type()).get_buffer(); } @@ -159,7 +169,7 @@ namespace tensor_array this->TensorContent::calc_grad(grad); if (this->can_calc_grad && this->forward_back.empty()) { - std::lock_guard tensor_lock(this->tensor_mutex); + std::lock_guard tensor_lock(this->tensor_mutex); std::forward_list thread_list; for (auto& dat : this->derive_data) if (this->derive_multithread) @@ -174,7 +184,7 @@ namespace tensor_array void Tensor::TensorContent::calc_grad(const Tensor& grad) { - std::lock_guard tensor_lock(this->tensor_mutex); + std::lock_guard tensor_lock(this->tensor_mutex); this->grad = add(this->grad, grad, false).get_buffer(); } @@ -468,8 +478,10 @@ temp_check_data_type = TEMP(temp.first) < TEMP(temp_tensor); return this->tensor_data.use_count(); } + std::mutex calc_grad_mutex; void Tensor::calc_grad() { + std::lock_guard calc_grad_lock(calc_grad_mutex); this->tensor_data->reset_grad(); this->tensor_data->calc_grad(values(this->get_buffer().shape(), 1.f).tensor_cast(this->get_buffer().type())); } @@ -732,26 +744,6 @@ temp_check_data_type = TEMP(temp.first) < TEMP(temp_tensor); return divide(a, b); } - Tensor operator!=(const Tensor& a, const Tensor& b) - { - return ab; - } - - Tensor operator==(const Tensor& a, const Tensor& b) - { - return !(a != b); - } - - Tensor operator>=(const Tensor& a, const Tensor& b) - { - return !(a < b); - } - - Tensor operator<=(const Tensor& a, const Tensor& b) - { - return !(a > b); - } - Tensor Tensor::exp() const { return this->exp(true); @@ -902,6 +894,7 @@ out_stream << static_cast(tensor_out); std::pair broadcast_t = tensor_broadcasting(a, temp_b, 0, 2); return batchedmatmul(broadcast_t.first, broadcast_t.second, true, nullptr); } + throw std::exception(); } Tensor condition(const Tensor& value_bool, const Tensor& value_true, const Tensor& value_false) diff --git a/src/tensor_array/core/tensor.hh b/src/tensor-array/core/tensor.hh similarity index 77% rename from src/tensor_array/core/tensor.hh 
rename to src/tensor-array/core/tensor.hh index 48b2670..af276d7 100644 --- a/src/tensor_array/core/tensor.hh +++ b/src/tensor-array/core/tensor.hh @@ -16,24 +16,15 @@ limitations under the License. #include #include +#include #include "tensorbase.hh" #pragma once -#ifdef __WIN32__ -#ifdef CUDA_ML_EXPORTS -#define CUDA_ML_API __declspec(dllexport) -#else -#define CUDA_ML_API __declspec(dllimport) -#endif -#else -#define CUDA_ML_API -#endif - -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) @@ -44,11 +35,21 @@ limitations under the License. #define END(...) END_(__VA_ARGS__) #define END_(...) __VA_ARGS__##_END +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_CORE_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace value { - extern CUDA_ML_API bool use_grad; + extern TENSOR_ARRAY_API bool use_grad; #ifdef TENSOR_CONTENT void* create_mem_101(std::size_t s, const void* dat); @@ -99,11 +100,17 @@ namespace tensor_array dilation; }; + class Tensor; + + TENSOR_ARRAY_API Tensor tensor_rand(const std::initializer_list&, unsigned int = std::rand()); + + TENSOR_ARRAY_API std::pair tensor_broadcasting(const Tensor&, const Tensor&, unsigned char = 0, unsigned char = 0); + /** * \brief Dynamic derivative tensor. * \brief This class use to calculate the tensor. 
*/ - class CUDA_ML_API Tensor + class TENSOR_ARRAY_API Tensor { public: /** @@ -130,7 +137,7 @@ namespace tensor_array /** * \brief This class can iterate copy child tensor by index and derivate to parent tensor, */ - class CUDA_ML_API Iterator + class TENSOR_ARRAY_API Iterator { public: using iterator_category = std::forward_iterator_tag; @@ -144,8 +151,8 @@ namespace tensor_array Iterator& operator--(); Iterator operator++(int); Iterator operator--(int); - friend bool CUDA_ML_API operator==(const Iterator&, const Iterator&); - friend bool CUDA_ML_API operator!=(const Iterator&, const Iterator&); + friend bool TENSOR_ARRAY_API operator==(const Iterator&, const Iterator&); + friend bool TENSOR_ARRAY_API operator!=(const Iterator&, const Iterator&); private: unsigned long long index; reference_left ref; @@ -183,17 +190,10 @@ namespace tensor_array Tensor reshape(const std::vector&) const; Tensor tensor_cast(const std::type_info&) const; Tensor conv_padding(const dimension&) const; -#ifdef TENSOR_CONTENT - friend Tensor derive_transpose(const Tensor&, const Tensor&, bool, const DataBuffer&); - - friend Tensor derive_reshape_cast(const Tensor&, const Tensor&, bool, const DataBuffer&); -#endif Tensor transpose(unsigned char, unsigned char) const; std::pair max(unsigned char = 0) const; std::pair min(unsigned char = 0) const; - friend std::pair tensor_broadcasting(const Tensor&, const Tensor&, unsigned char, unsigned char); #ifdef TENSOR_CONTENT - friend CUDA_ML_API Tensor add_dim(const std::vector&); #endif bool has_tensor() const; template @@ -225,10 +225,10 @@ namespace tensor_array Tensor& operator/=(const Tensor&); - friend CUDA_ML_API Tensor operator>(const Tensor&, const Tensor&); - friend CUDA_ML_API Tensor operator<(const Tensor&, const Tensor&); - friend CUDA_ML_API Tensor operator&&(const Tensor&, const Tensor&); - friend CUDA_ML_API Tensor operator||(const Tensor&, const Tensor&); + friend TENSOR_ARRAY_API Tensor operator>(const Tensor&, const Tensor&); + 
friend TENSOR_ARRAY_API Tensor operator<(const Tensor&, const Tensor&); + friend TENSOR_ARRAY_API Tensor operator&&(const Tensor&, const Tensor&); + friend TENSOR_ARRAY_API Tensor operator||(const Tensor&, const Tensor&); Tensor operator!(); Tensor exp() const; Tensor sin() const; @@ -238,11 +238,16 @@ namespace tensor_array Tensor cosh() const; Tensor tanh() const; Tensor sigmoid() const; + Tensor reduce_sum(unsigned char) const; + Tensor reduce_max(unsigned char) const; + Tensor reduce_min(unsigned char) const; Tensor log() const; #ifdef TENSOR_CONTENT - friend Tensor tensor_rand(const std::initializer_list&, unsigned int); + friend Tensor derive_transpose(const Tensor&, const Tensor&, bool, const DataBuffer&); + friend Tensor derive_reshape_cast(const Tensor&, const Tensor&, bool, const DataBuffer&); + friend Tensor add(const Tensor&, const Tensor&, bool); friend Tensor power(const Tensor&, const Tensor&, bool); @@ -264,7 +269,14 @@ namespace tensor_array Tensor tensor_cast(const std::type_info&, bool) const; #endif - friend CUDA_ML_API std::ostream& operator<<(std::ostream&, const Tensor&); + + friend TENSOR_ARRAY_API Tensor add_dim(const std::vector&); + + friend Tensor tensor_rand(const std::initializer_list&, unsigned int/* = std::rand() */); + + friend std::pair tensor_broadcasting(const Tensor&, const Tensor&, unsigned char /*= 0*/, unsigned char /*= 0*/); + + friend TENSOR_ARRAY_API std::ostream& operator<<(std::ostream&, const Tensor&); private: #ifdef TENSOR_CONTENT @@ -294,7 +306,7 @@ namespace tensor_array std::shared_ptr tensor_data; }; - class CUDA_ML_API WeakTensor + class TENSOR_ARRAY_API WeakTensor { public: WeakTensor(const Tensor&); @@ -304,13 +316,13 @@ namespace tensor_array std::weak_ptr tensor_data; }; - CUDA_ML_API dimension operator+(const dimension&, const dimension&); + TENSOR_ARRAY_API dimension operator+(const dimension&, const dimension&); - CUDA_ML_API dimension operator-(const dimension&, const dimension&); + TENSOR_ARRAY_API 
dimension operator-(const dimension&, const dimension&); - CUDA_ML_API dimension operator*(const dimension&, const dimension&); + TENSOR_ARRAY_API dimension operator*(const dimension&, const dimension&); - CUDA_ML_API dimension operator/(const dimension&, const dimension&); + TENSOR_ARRAY_API dimension operator/(const dimension&, const dimension&); /** * \brief Plus 2 n-d tensors. @@ -318,9 +330,9 @@ namespace tensor_array * \return * Tensor */ - CUDA_ML_API Tensor operator+(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator+(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator-(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator-(const Tensor&, const Tensor&); /** * \brief Multiply 2 n-d tensors. @@ -328,19 +340,19 @@ namespace tensor_array * \return * Tensor */ - CUDA_ML_API Tensor operator*(const Tensor&, const Tensor&); - - CUDA_ML_API Tensor operator/(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator!=(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator==(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator>=(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator<=(const Tensor&, const Tensor&); - CUDA_ML_API Tensor tensor_file_load(const char*); - CUDA_ML_API Tensor power(const Tensor&, const Tensor&); - CUDA_ML_API Tensor add(const Tensor&, const Tensor&); - CUDA_ML_API Tensor multiply(const Tensor&, const Tensor&); - CUDA_ML_API Tensor divide(const Tensor&, const Tensor&); - CUDA_ML_API Tensor dot(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator*(const Tensor&, const Tensor&); + + TENSOR_ARRAY_API Tensor operator/(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator!=(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator==(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator>=(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator<=(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor tensor_file_load(const char*); + 
TENSOR_ARRAY_API Tensor power(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor add(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor multiply(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor divide(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor dot(const Tensor&, const Tensor&); /** * \brief Matrix multiplication 2 matrices. * \param a Matrix/Tensor that has size (batch*)m*k. @@ -348,8 +360,8 @@ namespace tensor_array * \return Tensor - Matrix that has size (batch*)m*n. * \exception a.col != b.row */ - CUDA_ML_API Tensor matmul(const Tensor&, const Tensor&); - CUDA_ML_API Tensor condition(const Tensor&, const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor matmul(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor condition(const Tensor&, const Tensor&, const Tensor&); /** * \brief Convolution * \brief Only suport 1D, 2D, 3D convolution @@ -360,17 +372,12 @@ namespace tensor_array * \return * Tensor (N, K, ...) */ - CUDA_ML_API Tensor convolution(const Tensor&, const Tensor&, const dimension& = value::dimension(), const dimension& = value::dimension()); - CUDA_ML_API std::pair tensor_broadcasting(const Tensor&, const Tensor&, unsigned char = 0, unsigned char = 0); - CUDA_ML_API Tensor tensor_rand(const std::initializer_list&, unsigned int = std::rand()); -#define ADD_CODE(TYPE) CUDA_ML_API Tensor values(const std::initializer_list&, TYPE); + TENSOR_ARRAY_API Tensor convolution(const Tensor&, const Tensor&, const dimension& = value::dimension(), const dimension& = value::dimension()); +#define ADD_CODE(TYPE) TENSOR_ARRAY_API Tensor values(const std::initializer_list&, TYPE); LOOP(USING_DATA_TYPE); #undef ADD_CODE -#ifndef TENSOR_CONTENT - CUDA_ML_API Tensor add_dim(const std::vector&); -#endif - CUDA_ML_API const std::type_info& comparison_type(const std::type_info&, const std::type_info&); - CUDA_ML_API Tensor tensor_rand(const std::vector&, unsigned int = std::rand()); + TENSOR_ARRAY_API const std::type_info& 
comparison_type(const std::type_info&, const std::type_info&); + TENSOR_ARRAY_API Tensor tensor_rand(const std::vector&, unsigned int = std::rand()); #ifdef TENSOR_CONTENT class Derivation @@ -448,4 +455,4 @@ struct std::equal_to #undef USING_DATA_TYPE_SINT #undef USING_DATA_TYPE_UINT -#undef CUDA_ML_API \ No newline at end of file +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/core/tensor_blas.cu b/src/tensor-array/core/tensor_blas.cc similarity index 98% rename from src/tensor_array/core/tensor_blas.cu rename to src/tensor-array/core/tensor_blas.cc index a260ccb..1bb16e4 100644 --- a/src/tensor_array/core/tensor_blas.cu +++ b/src/tensor-array/core/tensor_blas.cc @@ -207,7 +207,10 @@ namespace tensor_array c_ptr, convert_cuda_type(c_type), shape_a.end()[-2], 1, batch_size, convert_cuda_type(c_type), CUBLAS_GEMM_DEFAULT); blasStat = cublasDestroy(blasHandle); - TensorBase value_buf(c_type, { batch_size, shape_a.end()[-2] , shape_b.end()[-1] }, c_ptr, this_cuda); + std::vector out_dims = shape_a; + out_dims[out_dims.size() - 1] = shape_b.end()[-1]; + + TensorBase value_buf(c_type, out_dims, c_ptr, this_cuda); cudaStat = cudaFree(c_ptr); return Tensor(std::move(value_buf), std::move(temp)); } diff --git a/src/tensor_array/core/tensor_cast.cu b/src/tensor-array/core/tensor_cast.cu similarity index 76% rename from src/tensor_array/core/tensor_cast.cu rename to src/tensor-array/core/tensor_cast.cu index 3c3da08..7749258 100644 --- a/src/tensor_array/core/tensor_cast.cu +++ b/src/tensor-array/core/tensor_cast.cu @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #ifndef TENSOR_CONTENT #define TENSOR_CONTENT #include "tensor.hh" @@ -34,22 +35,39 @@ limitations under the License. #define END(...) END_(__VA_ARGS__) #define END_(...) 
__VA_ARGS__##_END -#define USING_DATA_TYPE_NVIDIA_FLOAT_8 (__nv_fp8_e5m2)(__nv_fp8_e4m3) -#define USING_DATA_TYPE_NVIDIA_FLOAT (__half)(__nv_bfloat16) -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e4m3)(__nv_fp8_e5m2) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (__half)(__nv_bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE_CAST_TO \ -(bool) \ -USING_DATA_TYPE_SINT \ -USING_DATA_TYPE_UINT \ -USING_DATA_TYPE_FLOAT \ -USING_DATA_TYPE_NVIDIA_FLOAT +#if CUDART_VERSION >= 12020 +#define USING_DATA_TYPE_CAST_FROM() \ +USING_DATA_TYPE_SINT() \ +USING_DATA_TYPE_UINT() \ +USING_DATA_TYPE_FLOAT() \ +USING_DATA_TYPE_NVIDIA_FLOAT() +#else +#define USING_DATA_TYPE_CAST_FROM() \ +USING_DATA_TYPE_SINT() \ +USING_DATA_TYPE_UINT() \ +USING_DATA_TYPE_FLOAT() +#endif -#define USING_DATA_TYPE_CAST_FROM \ -USING_DATA_TYPE_CAST_TO \ -USING_DATA_TYPE_NVIDIA_FLOAT_8 +#if CUDART_VERSION >= 12020 +#define USING_DATA_TYPE_CAST_TO() \ +(bool) \ +(int8_t) \ +(uint8_t) \ +USING_DATA_TYPE_NVIDIA_FLOAT_8() \ +USING_DATA_TYPE_CAST_FROM() +#else +#define USING_DATA_TYPE_CAST_TO() \ +(bool) \ +(int8_t) \ +(uint8_t) \ +USING_DATA_TYPE_CAST_FROM() +#endif namespace tensor_array { @@ -85,13 +103,13 @@ namespace tensor_array #define ADD_CODE(TYPE) \ if(this->get_buffer().type() == typeid(TYPE)) \ type_casting<<>>(out_ptr, static_cast(base_of_this.data()), total_size); - LOOP(USING_DATA_TYPE_CAST_TO); + LOOP(USING_DATA_TYPE_CAST_FROM()); #undef ADD_CODE cuda_status = cudaDeviceSynchronize(); cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", 
cudaGetErrorString(cuda_status)); } std::type_index test = typeid(T); if (dynamic_type_size.find(test) == dynamic_type_size.end()) @@ -108,7 +126,7 @@ type_casting<<>>(out_ptr, static_cast(base_of_ #define ADD_CODE(TYPE) \ if(dtype == typeid(TYPE)) \ return this->cast(is_derive); - LOOP(USING_DATA_TYPE_CAST_FROM); + LOOP(USING_DATA_TYPE_CAST_TO()); #undef ADD_CODE throw std::exception(); } diff --git a/src/tensor_array/core/tensor_convolution.cc b/src/tensor-array/core/tensor_convolution.cc similarity index 100% rename from src/tensor_array/core/tensor_convolution.cc rename to src/tensor-array/core/tensor_convolution.cc diff --git a/src/tensor_array/core/tensor_convolution.cu b/src/tensor-array/core/tensor_convolution.cu similarity index 95% rename from src/tensor_array/core/tensor_convolution.cu rename to src/tensor-array/core/tensor_convolution.cu index ae2489f..6b3217d 100644 --- a/src/tensor_array/core/tensor_convolution.cu +++ b/src/tensor-array/core/tensor_convolution.cu @@ -17,6 +17,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -27,29 +28,29 @@ limitations under the License. 
#endif #if __CUDA_ARCH__ >= 800 -#define USE_BF16 (__nv_bfloat16) +#define USE_BF16() (__nv_bfloat16) #else -#define USE_BF16 +#define USE_BF16() #endif #if __CUDA_ARCH__ >= 700 -#define USE_FP16 (__half) +#define USE_FP16() (__half) #else -#define USE_FP16 +#define USE_FP16() #endif #if __CUDA_ARCH__ >= 600 -#define USE_FP64 (double) +#define USE_FP64() (double) #else -#define USE_FP64 +#define USE_FP64() #endif -#define USING_DATA_TYPE_NVIDIA_FLOAT_8 (__nv_fp8_e5m2)(__nv_fp8_e4m3) -#define USING_DATA_TYPE_NVIDIA_FLOAT USE_FP16 USE_BF16 -#define USING_DATA_TYPE_FLOAT (float)USE_FP64 -#define USING_DATA_TYPE_SINT (int32_t) -#define USING_DATA_TYPE_UINT (uint32_t)(unsigned long long int) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT USING_DATA_TYPE_NVIDIA_FLOAT +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() USE_FP16() USE_BF16() +#define USING_DATA_TYPE_FLOAT() (float) USE_FP64() +#define USING_DATA_TYPE_SINT() (int32_t) +#define USING_DATA_TYPE_UINT() (uint32_t)(unsigned long long int) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() USING_DATA_TYPE_NVIDIA_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) @@ -178,7 +179,7 @@ kernel_derive_conv_padding<<>>(static_cast(out_ptr), cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); } TensorBase value_buf(a.get_buffer().type(), new_shape, out_ptr, this_cuda); cuda_status = cudaFree(out_ptr); @@ -262,7 +263,7 @@ kernel_conv_padding<<>>(static_cast(out_ptr), static cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); } TensorBase value_buf(a.get_buffer().type(), 
new_shape, out_ptr, this_cuda); cuda_status = cudaFree(out_ptr); @@ -390,7 +391,7 @@ kernel_col2im<<>>(static_cast(out_ptr), static_cast< cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); } TensorBase value_buf(a.get_buffer().type(), new_shape, out_ptr, this_cuda); cuda_status = cudaFree(out_ptr); @@ -514,7 +515,7 @@ kernel_im2col<<>>(static_cast(out_ptr), static_cast< cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); } TensorBase value_buf(a.get_buffer().type(), new_shape, out_ptr, this_cuda); cuda_status = cudaFree(out_ptr); diff --git a/src/tensor_array/core/tensor.cu b/src/tensor-array/core/tensor_math_func.cu similarity index 62% rename from src/tensor_array/core/tensor.cu rename to src/tensor-array/core/tensor_math_func.cu index 9db96a8..5ccef02 100644 --- a/src/tensor_array/core/tensor.cu +++ b/src/tensor-array/core/tensor_math_func.cu @@ -28,15 +28,16 @@ limitations under the License. 
#ifndef TENSOR_CONTENT #define TENSOR_CONTENT #include "tensor.hh" +#include "tensor_math_op.hh" #undef TENSOR_CONTENT #endif // !TENSOR_CONTENT -#define USING_DATA_TYPE_NVIDIA_FLOAT_8 (__nv_fp8_e5m2)(__nv_fp8_e4m3) -#define USING_DATA_TYPE_NVIDIA_FLOAT (__half)(__nv_bfloat16) -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT USING_DATA_TYPE_NVIDIA_FLOAT +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (__half)(__nv_bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() USING_DATA_TYPE_NVIDIA_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) @@ -70,30 +71,6 @@ namespace tensor_array value_arr[thread_x] = curand_uniform(&thisState); } - template - __global__ void sum_2_arr(T c[], const T a[], const T b[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - c[thread_x] = a[thread_x] + b[thread_x]; - } - - template - __global__ void mul_2_arr(T c[], const T a[], const T b[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - c[thread_x] = a[thread_x] * b[thread_x]; - } - - template - __global__ void div_2_arr(T c[], const T a[], const T b[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - c[thread_x] = a[thread_x] / b[thread_x]; - } - __global__ void exp_arr(float value_out[], const float value_in[], unsigned int c_size) { unsigned int 
thread_x = blockIdx.x * blockDim.x + threadIdx.x; @@ -206,14 +183,6 @@ namespace tensor_array value_out[thread_x] = tanh(value_in[thread_x]); } - template - __global__ void sigmoid_arr(T value_out[], const T value_in[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - value_out[thread_x] = T(1) / (T(1) + T(exp(double(-value_in[thread_x])))); - } - template __global__ void pow_arr(T value_out[], const T a[], const T b[], unsigned int c_size) { @@ -225,45 +194,6 @@ namespace tensor_array value_out[thread_x] = powf(a[thread_x], b[thread_x]); } - template - __global__ void arr_more_than(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = in1_value[thread_x] > in2_value[thread_x]; - } - - template - __global__ void arr_less_than(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = in1_value[thread_x] < in2_value[thread_x]; - } - - __global__ void arr_logical_and(bool out_value[], const bool in1_value[], const bool in2_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = in1_value[thread_x] && in2_value[thread_x]; - } - - - __global__ void arr_logical_or(bool out_value[], const bool in1_value[], const bool in2_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = in1_value[thread_x] || in2_value[thread_x]; - } - - - __global__ void arr_logical_not(bool out_value[], const bool in1_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = 
!in1_value[thread_x]; - } - template __global__ void array_condition(T out_value[], unsigned int c_size, const bool bool_value[], const T true_value[], const T false_value[]) { @@ -323,7 +253,7 @@ namespace tensor_array cudaStat = cudaGetLastError(); if (cudaStat != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cudaStat)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cudaStat)); } TensorBase value_buf(a.get_buffer().type(), { shape_a.begin()[0], shape_a.begin()[2], shape_a.begin()[1], shape_a.end()[-1]}, c_ptr, this_cuda); cudaStat = cudaFree(c_ptr); @@ -381,197 +311,13 @@ return values0(list_dim, value); \ cudaStatus = cudaGetLastError(); if (cudaStatus != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus)); } TensorBase other_buf(typeid(float), list_dim, dev_ptr, this_cuda); cudaStatus = cudaFree(dev_ptr); return other_buf; } - Tensor operator>(const Tensor& a, const Tensor& b) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - cudaError cuda_status; - bool* c_ptr; - Device this_cuda{ CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - base_a.data_size() / get_sizeof_type(base_a.type()), - base_b.data_size() / get_sizeof_type(base_b.type()) - ); - cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -arr_more_than<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaDeviceSynchronize(); - cuda_status = cudaGetLastError(); - if (cuda_status != cudaSuccess) - { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); - } - TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); - cuda_status = cudaFree(c_ptr); - return other_buf; - } - - Tensor operator<(const Tensor& a, const Tensor& b) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - cudaError cuda_status; - bool* c_ptr; - Device this_cuda{CUDA}; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -arr_less_than<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); - cuda_status = cudaFree(c_ptr); - return other_buf; - } - - Tensor operator&&(const Tensor& a, const Tensor& b) - { - assert( - equal_dim_size(a.get_buffer(), b.get_buffer()) - && a.get_buffer().type() == typeid(bool) - && b.get_buffer().type() == typeid(bool) - ); - cudaError cuda_status; - bool* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, c_size); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); - arr_logical_and << > > (c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); - cuda_status = cudaFree(c_ptr); - return other_buf; - } - - Tensor operator||(const Tensor& a, const Tensor& b) - { - assert( - equal_dim_size(a.get_buffer(), b.get_buffer()) - && a.get_buffer().type() == typeid(bool) - && b.get_buffer().type() == typeid(bool) - ); - cudaError cuda_status; - bool* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, c_size); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); - arr_logical_or << > > (c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); - cuda_status = cudaFree(c_ptr); - return other_buf; - } - - Tensor Tensor::operator!() - { - assert(this->get_buffer().type() == typeid(bool)); - cudaError cuda_status; - bool* out_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_of_this = this->get_buffer().change_device(this_cuda); - cuda_status = cudaMalloc(&out_ptr, this->get_buffer().data_size()); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(this->get_buffer().data_size() / block_dim.x + 1U); - arr_logical_not << < grid_dim, block_dim >> > (out_ptr, static_cast(base_of_this.data()), this->get_buffer().data_size()); - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(typeid(bool), this->get_buffer().shape(), out_ptr, this_cuda); - cuda_status = cudaFree(out_ptr); - return other_buf; - } - - Tensor multiply(const Tensor& a, const Tensor& b, bool is_derive, const DataBuffer&) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - std::vector> temp; - if (is_derive) - { - temp.push_back(std::make_pair(a, Derivation(b.clone(), multiply))); - temp.push_back(std::make_pair(b, Derivation(a.clone(), multiply))); - } - cudaError cuda_status; - TensorBase other_buf; - void* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / 
get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -{ \ -mul_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ -cuda_status = cudaDeviceSynchronize(); \ -other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ -} - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaFree(c_ptr); - return Tensor(std::move(other_buf), std::move(temp)); - } - Tensor condition(const Tensor& bool_value, const Tensor& true_value, const Tensor& false_value, bool is_derive) { assert( @@ -615,84 +361,6 @@ other_buf = TensorBase(typeid(TYPE), bool_value.get_buffer().shape(), ptr_out, t return Tensor(std::move(other_buf), std::move(temp)); } - Tensor add(const Tensor& a, const Tensor& b, bool is_derive) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - std::vector> temp; - if (is_derive) - { - temp.push_back(std::make_pair(a, Derivation(values(a.get_buffer().shape(), 1).tensor_cast(a.get_buffer().type(), false), multiply))); - temp.push_back(std::make_pair(b, Derivation(values(b.get_buffer().shape(), 1).tensor_cast(b.get_buffer().type(), false), multiply))); - } - cudaError cuda_status; - TensorBase other_buf; - void* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / 
get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -{ \ -sum_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ -cuda_status = cudaDeviceSynchronize(); \ -other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ -} - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaFree(c_ptr); - return Tensor(std::move(other_buf), std::move(temp)); - } - - Tensor divide(const Tensor& a, const Tensor& b, bool is_derive) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - std::vector> temp; - if (is_derive) - { - temp.push_back(std::make_pair(a, Derivation(divide(values(b.get_buffer().shape(), 1).tensor_cast(b.get_buffer().type(), false), b, false), multiply))); - temp.push_back(std::make_pair(b, Derivation(divide(a, power(b, values(b.get_buffer().shape(), 2).tensor_cast(b.get_buffer().type(), false), false), false), multiply))); - } - cudaError cuda_status; - TensorBase other_buf; - void* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); - dim3 
block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -{ \ -div_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ -cuda_status = cudaDeviceSynchronize(); \ -other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ -} - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaFree(c_ptr); - return Tensor(std::move(other_buf), std::move(temp)); - } - Tensor power(const Tensor& a, const Tensor& b, bool is_derive) { assert(equal_dim_size(a.get_buffer(), b.get_buffer())); @@ -956,39 +624,6 @@ tanh_arr<<>>(static_cast(out_ptr), static_cast> temp; - if (is_derive) - { - Tensor temp_ones = values(this->get_buffer().shape(), 1.f).tensor_cast(this->get_buffer().type(), false); - Tensor temp_sigmoid = this->sigmoid(false); - temp.push_back(std::make_pair(*this, Derivation(multiply(temp_sigmoid, add(temp_ones, -temp_sigmoid, false), false, DataBuffer()), multiply))); - } - cudaError cuda_status; - void* out_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_of_this = this->get_buffer().change_device(this_cuda); - cuda_status = cudaMalloc(&out_ptr, this->get_buffer().data_size()); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - std::size_t out_size = this->get_buffer().data_size() / get_sizeof_type(this->get_buffer().type()); - dim3 grid_dim(out_size / block_dim.x + ((out_size % block_dim.x) ? 
1U : 0U)); -#define ADD_CODE(TYPE) \ -if(this->get_buffer().type() == typeid(TYPE)) \ -sigmoid_arr<<>>(static_cast(out_ptr), static_cast(base_of_this.data()), out_size); - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(this->get_buffer().type(), this->get_buffer().shape(), out_ptr, this_cuda); - cuda_status = cudaFree(out_ptr); - return Tensor(std::move(other_buf), std::move(temp)); - } - - } } diff --git a/src/tensor-array/core/tensor_math_op.cu b/src/tensor-array/core/tensor_math_op.cu new file mode 100644 index 0000000..2514095 --- /dev/null +++ b/src/tensor-array/core/tensor_math_op.cu @@ -0,0 +1,602 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef TENSOR_CONTENT +#define TENSOR_CONTENT +#include "tensor.hh" +#include "tensor_math_op.hh" +#undef TENSOR_CONTENT +#endif // !TENSOR_CONTENT + +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (__half)(__nv_bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#if CUDART_VERSION >= 12020 +#define USING_DATA_TYPE \ +USING_DATA_TYPE_SINT() \ +USING_DATA_TYPE_UINT() \ +USING_DATA_TYPE_FLOAT() \ +USING_DATA_TYPE_NVIDIA_FLOAT() +#else +#define USING_DATA_TYPE \ +USING_DATA_TYPE_SINT() \ +USING_DATA_TYPE_UINT() \ +USING_DATA_TYPE_FLOAT() +#endif + +#define LOOP(seq) END(A seq) +#define BODY(x) ADD_CODE(x) +#define A(x) BODY(x) B +#define B(x) BODY(x) A +#define A_END +#define B_END +#define END(...) END_(__VA_ARGS__) +#define END_(...) 
__VA_ARGS__##_END + +namespace tensor_array +{ + namespace value + { + using namespace devices; + + template + __global__ void sum_2_arr(T c[], const T a[], const T b[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + c[thread_x] = a[thread_x] + b[thread_x]; + } + + template + __global__ void mul_2_arr(T c[], const T a[], const T b[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + c[thread_x] = a[thread_x] * b[thread_x]; + } + + template + __global__ void div_2_arr(T c[], const T a[], const T b[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + c[thread_x] = a[thread_x] / b[thread_x]; + } + + template + __global__ void arr_greater_than(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] > in2_value[thread_x]; + } + + template + __global__ void arr_greater_equal(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] >= in2_value[thread_x]; + } + + template + __global__ void arr_less_than(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] < in2_value[thread_x]; + } + + template + __global__ void arr_less_equal(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] <= in2_value[thread_x]; + } + + template + 
__global__ void arr_equal_equal(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] == in2_value[thread_x]; + } + + template + __global__ void arr_not_equal(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] != in2_value[thread_x]; + } + + __global__ void arr_logical_and(bool out_value[], const bool in1_value[], const bool in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] && in2_value[thread_x]; + } + + + __global__ void arr_logical_or(bool out_value[], const bool in1_value[], const bool in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] || in2_value[thread_x]; + } + + + __global__ void arr_logical_not(bool out_value[], const bool in1_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = !in1_value[thread_x]; + } + + template + __global__ void sigmoid_arr(T value_out[], const T value_in[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + value_out[thread_x] = T(1) / (T(1) + T(exp(double(-value_in[thread_x])))); + } + + bool equal_dim_size(const TensorBase& a, const TensorBase& b); + + Tensor operator>(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{ CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; 
+ cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + base_a.data_size() / get_sizeof_type(base_a.type()), + base_b.data_size() / get_sizeof_type(base_b.type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_greater_than<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + cuda_status = cudaGetLastError(); + if (cuda_status != cudaSuccess) + { + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + } + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator>=(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{ CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + base_a.data_size() / get_sizeof_type(base_a.type()), + base_b.data_size() / get_sizeof_type(base_b.type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_greater_equal<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + cuda_status = cudaGetLastError(); + if (cuda_status != cudaSuccess) + { + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + } + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator<(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{CUDA}; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_less_than<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator<=(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{CUDA}; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_less_equal<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator==(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{CUDA}; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_equal_equal<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator!=(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{CUDA}; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_not_equal<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator&&(const Tensor& a, const Tensor& b) + { + assert( + equal_dim_size(a.get_buffer(), b.get_buffer()) + && a.get_buffer().type() == typeid(bool) + && b.get_buffer().type() == typeid(bool) + ); + cudaError cuda_status; + bool* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); + arr_logical_and << > > (c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator||(const Tensor& a, const Tensor& b) + { + assert( + equal_dim_size(a.get_buffer(), b.get_buffer()) + && a.get_buffer().type() == typeid(bool) + && b.get_buffer().type() == typeid(bool) + ); + cudaError cuda_status; + bool* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); + arr_logical_or << > > (c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor Tensor::operator!() + { + assert(this->get_buffer().type() == typeid(bool)); + cudaError cuda_status; + bool* out_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_of_this = this->get_buffer().change_device(this_cuda); + cuda_status = cudaMalloc(&out_ptr, this->get_buffer().data_size()); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(this->get_buffer().data_size() / block_dim.x + 1U); + arr_logical_not << < grid_dim, block_dim >> > (out_ptr, static_cast(base_of_this.data()), this->get_buffer().data_size()); + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), this->get_buffer().shape(), out_ptr, this_cuda); + cuda_status = cudaFree(out_ptr); + return other_buf; + } + + Tensor multiply(const Tensor& a, const Tensor& b, bool is_derive, const DataBuffer&) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + std::vector> temp; + if (is_derive) + { + temp.push_back(std::make_pair(a, Derivation(b.clone(), multiply))); + temp.push_back(std::make_pair(b, Derivation(a.clone(), multiply))); + } + cudaError cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / 
get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +{ \ +mul_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ +cuda_status = cudaDeviceSynchronize(); \ +other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf), std::move(temp)); + } + + Tensor add(const Tensor& a, const Tensor& b, bool is_derive) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + std::vector> temp; + if (is_derive) + { + temp.push_back(std::make_pair(a, Derivation(values(a.get_buffer().shape(), 1).tensor_cast(a.get_buffer().type(), false), multiply))); + temp.push_back(std::make_pair(b, Derivation(values(b.get_buffer().shape(), 1).tensor_cast(b.get_buffer().type(), false), multiply))); + } + cudaError cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 
grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +{ \ +sum_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ +cuda_status = cudaDeviceSynchronize(); \ +other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf), std::move(temp)); + } + + Tensor divide(const Tensor& a, const Tensor& b, bool is_derive) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + std::vector> temp; + if (is_derive) + { + temp.push_back(std::make_pair(a, Derivation(divide(values(b.get_buffer().shape(), 1).tensor_cast(b.get_buffer().type(), false), b, false), multiply))); + temp.push_back(std::make_pair(b, Derivation(divide(a, power(b, values(b.get_buffer().shape(), 2).tensor_cast(b.get_buffer().type(), false), false), false), multiply))); + } + cudaError cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +{ \ +div_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ +cuda_status = cudaDeviceSynchronize(); \ +other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf), std::move(temp)); + } + + Tensor Tensor::sigmoid(bool is_derive) const + { + std::vector> temp; + if (is_derive) + { + Tensor temp_ones = values(this->get_buffer().shape(), 1.f).tensor_cast(this->get_buffer().type(), false); + Tensor temp_sigmoid = this->sigmoid(false); + temp.push_back(std::make_pair(*this, Derivation(multiply(temp_sigmoid, add(temp_ones, -temp_sigmoid, false), false, DataBuffer()), multiply))); + } + cudaError cuda_status; + void* out_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_of_this = this->get_buffer().change_device(this_cuda); + cuda_status = cudaMalloc(&out_ptr, this->get_buffer().data_size()); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + std::size_t out_size = this->get_buffer().data_size() / get_sizeof_type(this->get_buffer().type()); + dim3 grid_dim(out_size / block_dim.x + ((out_size % block_dim.x) ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(this->get_buffer().type() == typeid(TYPE)) \ +sigmoid_arr<<>>(static_cast(out_ptr), static_cast(base_of_this.data()), out_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(this->get_buffer().type(), this->get_buffer().shape(), out_ptr, this_cuda); + cuda_status = cudaFree(out_ptr); + return Tensor(std::move(other_buf), std::move(temp)); + } + } +} + +#undef LOOP +#undef BODY +#undef A +#undef B +#undef A_END +#undef B_END +#undef END +#undef END_ + +#undef USING_DATA_TYPE diff --git a/src/tensor-array/core/tensor_math_op.hh b/src/tensor-array/core/tensor_math_op.hh new file mode 100644 index 0000000..1d3c33e --- /dev/null +++ b/src/tensor-array/core/tensor_math_op.hh @@ -0,0 +1,27 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef TENSOR_CONTENT +namespace tensor_array +{ + namespace value + { + Tensor multiply(const Tensor&, const Tensor&, bool, const DataBuffer&); + Tensor add(const Tensor&, const Tensor&, bool); + Tensor divide(const Tensor&, const Tensor&, bool); + } +} +#endif diff --git a/src/tensor-array/core/tensor_reduce.cu b/src/tensor-array/core/tensor_reduce.cu new file mode 100644 index 0000000..5b6ddfd --- /dev/null +++ b/src/tensor-array/core/tensor_reduce.cu @@ -0,0 +1,368 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef TENSOR_CONTENT +#define TENSOR_CONTENT +#include "tensor.hh" +#undef TENSOR_CONTENT +#endif + +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (__half)(__nv_bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() + +#define LOOP(seq) END(A seq) +#define BODY(x) ADD_CODE(x) +#define A(x) BODY(x) B +#define B(x) BODY(x) A +#define A_END +#define B_END +#define END(...) END_(__VA_ARGS__) +#define END_(...) 
__VA_ARGS__##_END


namespace tensor_array
{
    namespace value
    {
        using namespace devices;

        // One step of a shared-memory tree reduction: fold the partial value held
        // at depth (threadIdx.z + value) into this thread's slot at threadIdx.z.
        template <typename T, unsigned int BatchBlockSize, unsigned int DimBlockSize, unsigned int ContentBlockSize>
        __device__ void warp_reduce_sum(T (*sdata)[BatchBlockSize][DimBlockSize][ContentBlockSize], unsigned int value)
        {
            (*sdata)[threadIdx.x][threadIdx.z][threadIdx.y] += (*sdata)[threadIdx.x][threadIdx.z + value][threadIdx.y];
        }

        // Drives a halving tree reduction over the z (reduced) axis of the shared
        // buffer, invoking `func` with strides 512, 256, ..., 1 as permitted by
        // DimBlockSize.
        // NOTE(review): no __syncthreads() between steps — this is only safe while
        // the active z-threads stay within one warp (current launches use
        // DimBlockSize == 8); confirm before raising the z block size.
        template <typename T, unsigned int BatchBlockSize, unsigned int DimBlockSize, unsigned int ContentBlockSize, typename ... Args>
        __device__ void warp_reduce_functions
        (
            // FIX: the callback type must carry Args... — warp_reduce_max/min are
            // passed here with an extra index-buffer argument.
            void(*func)(T (*)[BatchBlockSize][DimBlockSize][ContentBlockSize], unsigned int, Args...),
            T (*sdata)[BatchBlockSize][DimBlockSize][ContentBlockSize],
            Args ... args
        )
        {
            if constexpr (DimBlockSize >= 1024) if (threadIdx.z < 512) func(sdata, 512, args...);
            if constexpr (DimBlockSize >= 512) if (threadIdx.z < 256) func(sdata, 256, args...);
            if constexpr (DimBlockSize >= 256) if (threadIdx.z < 128) func(sdata, 128, args...);
            if constexpr (DimBlockSize >= 128) if (threadIdx.z < 64) func(sdata, 64, args...);
            if constexpr (DimBlockSize >= 64) if (threadIdx.z < 32) func(sdata, 32, args...);
            if constexpr (DimBlockSize >= 32) if (threadIdx.z < 16) func(sdata, 16, args...);
            if constexpr (DimBlockSize >= 16) if (threadIdx.z < 8) func(sdata, 8, args...);
            if constexpr (DimBlockSize >= 8) if (threadIdx.z < 4) func(sdata, 4, args...);
            if constexpr (DimBlockSize >= 4) if (threadIdx.z < 2) func(sdata, 2, args...);
            if constexpr (DimBlockSize >= 2) if (threadIdx.z < 1) func(sdata, 1, args...);
        }

        // Keep the larger value — and the index it came from — in slot threadIdx.z.
        template <typename T, unsigned int BatchBlockSize, unsigned int DimBlockSize, unsigned int ContentBlockSize>
        __device__ void warp_reduce_max(T (*sdata)[BatchBlockSize][DimBlockSize][ContentBlockSize], unsigned int value, unsigned int (*sindex)[BatchBlockSize][DimBlockSize][ContentBlockSize])
        {
            // FIX: the original compared/assigned through the un-dereferenced
            // pointers (indexing whole 3-D arrays, i.e. pointer comparison) and
            // self-assigned slot (z + value); copy the winner from (z + value)
            // into slot z, mirroring warp_reduce_min.
            if ((*sdata)[threadIdx.x][threadIdx.z][threadIdx.y] < (*sdata)[threadIdx.x][threadIdx.z + value][threadIdx.y])
            {
                (*sdata)[threadIdx.x][threadIdx.z][threadIdx.y] = (*sdata)[threadIdx.x][threadIdx.z + value][threadIdx.y];
                (*sindex)[threadIdx.x][threadIdx.z][threadIdx.y] = (*sindex)[threadIdx.x][threadIdx.z + value][threadIdx.y];
            }
        }

        template <typename T, unsigned int BatchBlockSize, unsigned int DimBlockSize, unsigned int ContentBlockSize>
        __device__ void
warp_reduce_min(T (*sdata)[BatchBlockSize][DimBlockSize][ContentBlockSize], unsigned int value, unsigned int (*sindex)[BatchBlockSize][DimBlockSize][ContentBlockSize]) + { + if (sdata[threadIdx.x][threadIdx.z][threadIdx.y] > sdata[threadIdx.x][threadIdx.z + value][threadIdx.y]) + { + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = sdata[threadIdx.x][threadIdx.z + value][threadIdx.y]; + sindex[threadIdx.x][threadIdx.z][threadIdx.y] = sindex[threadIdx.x][threadIdx.z + value][threadIdx.y]; + } + } + + template + __global__ void array_reduce_sum(T *g_odata, const T *g_idata, unsigned int batch_size, unsigned int n, unsigned int content_size) + { + __shared__ T sdata[BatchBlockSize][BlockSize][ContentBlockSize]; + unsigned int batch_id = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int content_id = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int tid = threadIdx.z; + unsigned int gridSize = blockDim.z * gridDim.z; + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = 0; + if (batch_id < batch_size && blockIdx.z * blockDim.z + tid < n && content_id < content_size) + sdata[threadIdx.x][threadIdx.z][threadIdx.y] += + g_idata + [ + batch_id * n * content_size + + tid * content_size + + content_id + ]; + __syncthreads(); + if (tid < 512) + warp_reduce_functions(&warp_reduce_sum, &sdata); + if (tid == 0) + g_odata[ + batch_id * blockDim.z * content_size + + tid * content_size + + content_id + ] = sdata[threadIdx.x][threadIdx.z][threadIdx.y]; + } + + template + __global__ void array_reduce_max(T *g_odata, unsigned int *g_oindex, const T *g_idata, unsigned int batch_size, unsigned int n, unsigned int content_size) + { + __shared__ T sdata[BatchBlockSize][BlockSize][ContentBlockSize]; + __shared__ unsigned int sindex[BatchBlockSize][BlockSize][ContentBlockSize]; + unsigned int batch_id = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int content_id = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int tid = threadIdx.z; + unsigned int gridSize = blockDim.z * gridDim.z; 
+ sdata[threadIdx.x][threadIdx.z][threadIdx.y] = -std::numeric_limits::infinity(); + sindex[threadIdx.x][threadIdx.z][threadIdx.y] = threadIdx.z; + if (batch_id < batch_size && blockIdx.z * blockDim.z + tid < n && content_id < content_size) + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = + g_idata + [ + batch_id * n * content_size + + tid * content_size + + content_id + ]; + __syncthreads(); + if (tid < 512) + warp_reduce_functions(&warp_reduce_max, &sdata, &sindex); + if (tid == 0) + { + g_odata[ + batch_id * blockDim.z * content_size + + blockIdx.z * content_size + + content_id + ] = sdata[threadIdx.x][threadIdx.z][threadIdx.y]; + g_oindex[ + batch_id * blockDim.z * content_size + + blockIdx.z * content_size + + content_id + ] = sindex[threadIdx.x][threadIdx.z][threadIdx.y]; + } + } + + template + __global__ void array_reduce_min(T *g_odata, unsigned int *g_oindex, const T *g_idata, unsigned int batch_size, unsigned int n, unsigned int content_size) + { + __shared__ T sdata[BatchBlockSize][BlockSize][ContentBlockSize]; + __shared__ unsigned int sindex[BatchBlockSize][BlockSize][ContentBlockSize]; + unsigned int batch_id = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int content_id = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int tid = threadIdx.z; + unsigned int gridSize = blockDim.z * gridDim.z; + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = std::numeric_limits::infinity(); + sindex[threadIdx.x][threadIdx.z][threadIdx.y] = threadIdx.z; + if (batch_id < batch_size && blockIdx.z * blockDim.z + tid < n && content_id < content_size) + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = + g_idata + [ + batch_id * n * content_size + + tid * content_size + + content_id + ]; + __syncthreads(); + if (tid < 512) + warp_reduce_functions(&warp_reduce_min, &sdata, &sindex); + if (tid == 0) + { + g_odata[ + batch_id * blockDim.z * content_size + + blockIdx.z * content_size + + content_id + ] = sdata[threadIdx.x][threadIdx.z][threadIdx.y]; + g_oindex[ + batch_id * 
blockDim.z * content_size + + blockIdx.z * content_size + + content_id + ] = sindex[threadIdx.x][threadIdx.z][threadIdx.y]; + } + } + + bool equal_dim_size(const TensorBase&, const TensorBase&); + Tensor derive_reduce_sum(const Tensor& a, const Tensor& b, bool is_derive, const DataBuffer& databuf) + { + return multiply(a, b, is_derive, databuf); + } + + Tensor Tensor::reduce_sum(unsigned char dim) const + { + std::vector shape_c = this->get_buffer().shape(); + assert(dim < shape_c.size()); + std::vector> temp; + temp.push_back(std::make_pair(*this, Derivation(values(shape_c, 1).tensor_cast(this->get_buffer().type(), false), derive_reduce_sum))); + cudaError_t cuda_status; + TensorBase other_buf; + + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + const TensorBase& base_a = this->get_buffer(); + cuda_status = cudaMalloc(&c_ptr, base_a.data_size()); + device_memcpy(&c_ptr, this_cuda, base_a.data(), base_a.get_device(), base_a.data_size()); + + unsigned int dim_x = 1; + for (unsigned char i = 0; i < dim; i++) + dim_x *= shape_c[i]; + + unsigned int dim_y = 1; + for (unsigned char i = dim+1; i < shape_c.size(); i++) + dim_y *= shape_c[i]; + + constexpr unsigned int thread_value_x = 8U; + constexpr unsigned int thread_value_y = 16U; + constexpr unsigned int thread_value_z = 8U; + dim3 block_dim(thread_value_x, thread_value_y, thread_value_z); + dim3 grid_dim + ( + dim_x / block_dim.x + (dim_x % block_dim.x ? 1U : 0U), + dim_y / block_dim.y + (dim_y % block_dim.y ? 1U : 0U), + shape_c[dim] / block_dim.z + (shape_c[dim] % block_dim.z ? 
1U : 0U) + ); +#define ADD_CODE(TYPE) \ +if(base_a.type() == typeid(TYPE)) \ +{ \ +while (shape_c[dim] > 1) \ +{ \ +array_reduce_sum<<>>(static_cast(c_ptr), static_cast(c_ptr), dim_x, shape_c[dim], dim_y); \ +cuda_status = cudaDeviceSynchronize(); \ +shape_c[dim] = grid_dim.z; \ +grid_dim.z = grid_dim.z / block_dim.z + (grid_dim.z % block_dim.z ? 1U : 0U); \ +} \ +other_buf = TensorBase(typeid(TYPE), shape_c, c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf)); + } + + Tensor Tensor::reduce_max(unsigned char dim) const + { + std::vector shape_c = this->get_buffer().shape(); + assert(dim < shape_c.size()); + cudaError_t cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + const TensorBase& base_a = this->get_buffer(); + cuda_status = cudaMalloc(&c_ptr, base_a.data_size()); + device_memcpy(&c_ptr, this_cuda, base_a.data(), base_a.get_device(), base_a.data_size()); + + unsigned int dim_x = 1; + for (unsigned char i = 0; i < dim; i++) + dim_x *= shape_c[i]; + + unsigned int dim_y = 1; + for (unsigned char i = dim+1; i < shape_c.size(); i++) + dim_y *= shape_c[i]; + + constexpr unsigned int thread_value_x = 8U; + constexpr unsigned int thread_value_y = 16U; + constexpr unsigned int thread_value_z = 8U; + dim3 block_dim(thread_value_x, thread_value_y, thread_value_z); + dim3 grid_dim + ( + dim_x / block_dim.x + (dim_x % block_dim.x ? 1U : 0U), + dim_y / block_dim.y + (dim_y % block_dim.y ? 1U : 0U), + shape_c[dim] / block_dim.z + (shape_c[dim] % block_dim.z ? 
1U : 0U) + ); +#define ADD_CODE(TYPE) \ +if(base_a.type() == typeid(TYPE)) \ +{ \ +while (shape_c[dim] > 1) \ +{ \ +array_reduce_sum<<>>(static_cast(c_ptr), static_cast(c_ptr), dim_x, shape_c[dim], dim_y); \ +cuda_status = cudaDeviceSynchronize(); \ +shape_c[dim] = grid_dim.z; \ +grid_dim.z = grid_dim.z / block_dim.z + (grid_dim.z % block_dim.z ? 1U : 0U); \ +} \ +other_buf = TensorBase(typeid(TYPE), base_a.shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + std::vector> temp; + temp.push_back(std::make_pair(*this, Derivation(values(this->get_buffer().shape(), 1).tensor_cast(this->get_buffer().type(), false), derive_reduce_sum))); + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf)); + } + + Tensor Tensor::reduce_min(unsigned char dim) const + { + std::vector shape_c = this->get_buffer().shape(); + assert(dim < shape_c.size()); + cudaError_t cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + const TensorBase& base_a = this->get_buffer(); + cuda_status = cudaMalloc(&c_ptr, base_a.data_size()); + device_memcpy(&c_ptr, this_cuda, base_a.data(), base_a.get_device(), base_a.data_size()); + unsigned int dim_x = 1; + for (unsigned char i = 0; i < dim; i++) + dim_x *= shape_c[i]; + + unsigned int dim_y = 1; + for (unsigned char i = dim+1; i < shape_c.size(); i++) + dim_y *= shape_c[i]; + + constexpr unsigned int thread_value_x = 8U; + constexpr unsigned int thread_value_y = 16U; + constexpr unsigned int thread_value_z = 8U; + dim3 block_dim(thread_value_x, thread_value_y, thread_value_z); + dim3 grid_dim + ( + dim_x / block_dim.x + (dim_x % block_dim.x ? 1U : 0U), + dim_y / block_dim.y + (dim_y % block_dim.y ? 1U : 0U), + shape_c[dim] / block_dim.z + (shape_c[dim] % block_dim.z ? 
1U : 0U) + ); +#define ADD_CODE(TYPE) \ +if(base_a.type() == typeid(TYPE)) \ +{ \ +while (shape_c[dim] > 1) \ +{ \ +array_reduce_sum<<>>(static_cast(c_ptr), static_cast(c_ptr), dim_x, shape_c[dim], dim_y); \ +cuda_status = cudaDeviceSynchronize(); \ +shape_c[dim] = grid_dim.z; \ +grid_dim.z = grid_dim.z / block_dim.z + (grid_dim.z % block_dim.z ? 1U : 0U); \ +} \ +other_buf = TensorBase(typeid(TYPE), base_a.shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + std::vector> temp; + temp.push_back(std::make_pair(*this, Derivation(values(this->get_buffer().shape(), 1).tensor_cast(this->get_buffer().type(), false), derive_reduce_sum))); + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf)); + } + } +} diff --git a/src/tensor_array/core/tensorarray.hh b/src/tensor-array/core/tensorarray.hh similarity index 100% rename from src/tensor_array/core/tensorarray.hh rename to src/tensor-array/core/tensorarray.hh diff --git a/src/tensor_array/core/tensorbase.cc b/src/tensor-array/core/tensorbase.cc similarity index 96% rename from src/tensor_array/core/tensorbase.cc rename to src/tensor-array/core/tensorbase.cc index 55c5398..d3cc7a7 100644 --- a/src/tensor_array/core/tensorbase.cc +++ b/src/tensor-array/core/tensorbase.cc @@ -23,10 +23,10 @@ limitations under the License. 
#include #include -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) diff --git a/src/tensor_array/core/tensorbase.hh b/src/tensor-array/core/tensorbase.hh similarity index 94% rename from src/tensor_array/core/tensorbase.hh rename to src/tensor-array/core/tensorbase.hh index 0c0e863..e7c42d5 100644 --- a/src/tensor_array/core/tensorbase.hh +++ b/src/tensor-array/core/tensorbase.hh @@ -23,14 +23,14 @@ limitations under the License. #include "initializer_wrapper.hh" #pragma once -#ifdef __WIN32__ -#ifdef CUDA_ML_EXPORTS -#define CUDA_ML_API __declspec(dllexport) +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_CORE_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) #else -#define CUDA_ML_API __declspec(dllimport) +#define TENSOR_ARRAY_API __declspec(dllimport) #endif #else -#define CUDA_ML_API +#define TENSOR_ARRAY_API #endif namespace tensor_array @@ -40,7 +40,7 @@ namespace tensor_array /** * \brief This class look like std::any but it tensor. */ - class CUDA_ML_API TensorBase + class TENSOR_ARRAY_API TensorBase { private: struct TensorStorage @@ -64,7 +64,7 @@ namespace tensor_array class TensorArrayStorage final : public TensorStorage { private: - static constexpr const std::array dim_size_array{ sz0, sz... }; + static constexpr inline const unsigned int dim_size_array[sizeof...(sz) + 1ULL] = { sz0, sz... 
}; const TensorArray arr_data; public: constexpr TensorArrayStorage(const TensorArray& arr_data) : @@ -87,7 +87,7 @@ namespace tensor_array inline std::initializer_list dim_sizes() const override { - return wrapper::initializer_wrapper(dim_size_array.data(), dim_size_array.data() + sizeof...(sz) + 1ULL); + return wrapper::initializer_wrapper(dim_size_array, dim_size_array + sizeof...(sz) + 1ULL); } inline const void* data() const override @@ -210,7 +210,7 @@ namespace tensor_array }; std::size_t get_sizeof_type(const std::type_info&); -} + } } -#undef CUDA_ML_API \ No newline at end of file +#undef TENSOR_ARRAY_API diff --git a/src/tensor-array/interp/main.c b/src/tensor-array/interp/main.c new file mode 100644 index 0000000..7aa7285 --- /dev/null +++ b/src/tensor-array/interp/main.c @@ -0,0 +1,117 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include +#include "option.h" +#include "open_file.h" +#include "parser.h" + +void initialize(int argc, char *argv[]) +{ + int i, fd; + while (argc <= 0) + { + char *argv_opt = ""; + size_t poolsize = 1024; // Default pool size + switch (argv_opt[0]) + { + case '-': + switch (argv_opt[1]) + { + case 'h': + help(); + return; + case 'v': + version(); + return; + case 'f': + if (argc < 2) + { + fprintf(stderr, "Error: No file specified after -f option\n"); + exit(1); + return; + } + read_file(argv[1]); + argc--; + argv++; + return; + case '-': + if (strcmp(argv_opt, "--help") == 0) + { + help(); + return; + } + else if (strcmp(argv_opt, "--version") == 0) + { + version(); + return; + } + else if (strcmp(argv_opt, "--poolsize") == 0) + { + if (argc < 2) + { + fprintf(stderr, "Error: No pool size specified after --poolsize option\n"); + exit(1); + return; + } + poolsize = atoi(argv[1]); + if (poolsize <= 0) + { + fprintf(stderr, "Error: Invalid pool size specified\n"); + exit(1); + return; + } + argc--; + argv++; + } + else if (strcmp(argv_opt, "--file") == 0) + { + if (argc < 2) + { + fprintf(stderr, "Error: No file specified after --file option\n"); + exit(1); + return; + } + read_file(argv[1]); + argc--; + argv++; + } + return; + default: + read_file(argv[0]); + return; + } + break; + default: + break; + } + argc--; + argv++; + } + +} + +int main(int argc, char *argv[]) +{ + printf("Hello\n"); + initialize(argc-1, argv+1); + program(); + return 0; +} +// Future implementations may include command-line argument parsing, initialization of the TensorArray library, +// and other necessary setup for the interp functionality. 
diff --git a/src/tensor-array/interp/open_file.c b/src/tensor-array/interp/open_file.c new file mode 100644 index 0000000..63a34f6 --- /dev/null +++ b/src/tensor-array/interp/open_file.c @@ -0,0 +1,79 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include +#include "open_file.h" + +char *src = NULL; +VM_INSTRUCTION *text = NULL; +size_t poolsize = 1024; // Default pool size + +void interp_malloc() +{ + src = malloc(poolsize); + if (src == NULL) + { + fprintf(stderr, "Error: Could not allocate memory for interpreter\n"); + exit(1); + } + text = malloc(poolsize*sizeof(VM_INSTRUCTION)); + if (text == NULL) + { + fprintf(stderr, "Error: Could not allocate memory for interpreter text\n"); + free(src); + exit(1); + } +} + +void interp_memreset() +{ + memset(text, 0, poolsize); + memset(src, 0, poolsize); +} + +void interp_free() +{ + free(text); + free(src); +} + +void read_file(const char* filename) +{ + FILE* fptr = fopen(filename, "r"); + if (fptr == NULL) + { + fprintf(stderr, "Error: Could not open file %s\n", filename); + exit(1); + } + + int i; + interp_malloc(); + interp_memreset(); + i = fread(src, poolsize, 1, fptr); + if (i < 0) + { + fprintf(stderr, "Error: Could not read file %s\n", filename); + fclose(fptr); + exit(1); + } + orig = text; + text = text - 1; + src[i] = '\0'; + fclose(fptr); + +} diff --git a/src/tensor-array/interp/open_file.h b/src/tensor-array/interp/open_file.h new file mode 100644 index 
0000000..7d37999 --- /dev/null +++ b/src/tensor-array/interp/open_file.h @@ -0,0 +1,27 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "vm_instruction.h" +#include "vm.h" + +extern char *src; +extern VM_INSTRUCTION *text; +extern size_t poolsize; + +void interp_malloc(); +void interp_memreset(); +void interp_free(); +void read_file(const char*); diff --git a/src/tensor-array/interp/option.c b/src/tensor-array/interp/option.c new file mode 100644 index 0000000..ef83c45 --- /dev/null +++ b/src/tensor-array/interp/option.c @@ -0,0 +1,32 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include + +void help() +{ + printf("Usage: tensor-array [options]\n"); + printf("Options:\n"); + printf(" -h, --help Show this help message\n"); + printf(" -v, --version Show version information\n"); + printf(" --poolsize SIZE Set the pool size (default: 1024)\n"); + printf(" -f, --file FILE Open the specified file\n"); +} + +void version() +{ + printf("Tensor Array Interpreter Version 0.1.0\n"); +} \ No newline at end of file diff --git a/src/tensor-array/interp/option.h b/src/tensor-array/interp/option.h new file mode 100644 index 0000000..168b613 --- /dev/null +++ b/src/tensor-array/interp/option.h @@ -0,0 +1,18 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +void help(); +void version(); diff --git a/src/tensor-array/interp/parser.c b/src/tensor-array/interp/parser.c new file mode 100644 index 0000000..0bd9d0c --- /dev/null +++ b/src/tensor-array/interp/parser.c @@ -0,0 +1,320 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include +#include +#include "parser.h" +#include "token.h" +#include "open_file.h" +#include "vm_instruction.h" +#include "sym_map.h" +#include "vm_type.h" + +void emit(unsigned size, ...) +{ + va_list args; + va_start(args, size); + + // Process the variable arguments as needed + for (unsigned i = 0; i < size; ++i) { + ++text; + *text = va_arg(args, VM_INSTRUCTION); + } + + va_end(args); +} + +void match(int tk) +{ + if (tkn == tk) { + token_next(); // Move to the next token + } else { + if (tk < 0x80) { + fprintf(stderr, "Error: Expected token %ld but found %ld\n", tk, tkn); + } else { + char* tn = tknname[tk - 0x80]; + fprintf(stderr, "Error: Expected token %s but found %s\n", tn, tkn); + } + exit(1); + } +} + +void expression(int level) +{ + sym_data* temp = NULL; // Temporary variable to hold intermediate values + int isArrRef = 0; // Flag to check if we are dealing with an array reference + // This function would handle parsing and evaluating expressions + // For now, it is a placeholder + // You can implement your logic here + switch (tkn) + { + case TOKEN_NUM: + /* code */ + emit(3, IMM, TYPE_INT, tkn_val); + match(TOKEN_NUM); + break; + case TOKEN_ID: + /* code */ + temp = sym_cur; + match(TOKEN_ID); + if (temp->type) + { + if (tkn == '(') + { + /* code */ + match('('); + match(')'); + emit(2, CALL, temp->data); + } + + } + else + { + emit(3, IMM, TYPE_PTR, tkn_val); + emit(1, GET); + } + break; + case '"': + { + emit(3, IMM, TYPE_STRING, tkn_val); + match('"'); // Match the opening quote + } + break; + case '[': + if (temp == NULL) + { + *text = PTR_PUSH; // Push the current value onto the stack + match('['); // Match the opening bracket + expression(TOKEN_ASSIGN); // Parse the expression inside the brackets + emit(1, GETELEM); // Emit get element instruction + match(']'); // Match the closing bracket + } + break; + default: + 
break; + } + + while (tkn >= level) + { + switch (tkn) + { + case TOKEN_ASSIGN: + if (*text != GET && *text != GETELEM) + { + fprintf(stderr, "Error: Assignment without a variable\n"); + exit(1); + } + *text = PTR_PUSH; // Push the current value onto the stack + match(TOKEN_ASSIGN); + expression(TOKEN_ASSIGN); // Parse the right-hand side expression + if (isArrRef) emit(1, SETELEM); // Emit set element instruction if it's an array reference + else emit(1, SET); // Emit set instruction + break; + case TOKEN_EQ: + emit(1, PUSH); + match(TOKEN_EQ); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, EQ); // Emit equality instruction + break; + case TOKEN_NE: + emit(1, PUSH); + match(TOKEN_NE); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, NE); // Emit not equal instruction + break; + case TOKEN_LT: + emit(1, PUSH); + match(TOKEN_LT); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, LT); // Emit less than instruction + break; + case TOKEN_GT: + emit(1, PUSH); + match(TOKEN_GT); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, GT); // Emit greater than instruction + break; + case TOKEN_LE: + emit(1, PUSH); + match(TOKEN_LE); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, LE); // Emit less than or equal instruction + break; + case TOKEN_GE: + emit(1, PUSH); + match(TOKEN_GE); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, GE); // Emit greater than or equal instruction + break; + case TOKEN_SHL: + emit(1, PUSH); + match(TOKEN_SHL); + expression(TOKEN_ADD); // Parse the right-hand side expression + emit(1, SHL); // Emit shift left instruction + break; + case TOKEN_SHR: + emit(1, PUSH); + match(TOKEN_SHR); + expression(TOKEN_ADD); // Parse the right-hand side expression + emit(1, SHR); // Emit shift right instruction + break; + case TOKEN_ADD: + emit(1, PUSH); + match(TOKEN_ADD); + expression(TOKEN_MUL); // Parse the 
right-hand side expression + emit(1, ADD); // Emit add instruction + break; + case TOKEN_SUB: + emit(1, PUSH); + match(TOKEN_SUB); + expression(TOKEN_MUL); // Parse the right-hand side expression + emit(1, SUB); // Emit subtract instruction + break; + case TOKEN_MUL: + emit(1, PUSH); + match(TOKEN_MUL); + expression(TOKEN_MATMUL); // Parse the right-hand side expression + emit(1, MUL); // Emit multiply instruction + break; + case TOKEN_DIV: + emit(1, PUSH); + match(TOKEN_DIV); + expression(TOKEN_MATMUL); // Parse the right-hand side expression + emit(1, DIV); // Emit divide instruction + break; + case TOKEN_MATMUL: + emit(1, PUSH); + match(TOKEN_MATMUL); + expression(TOKEN_INC); // Parse the right-hand side expression + emit(1, MATMUL); // Emit matrix multiply instruction + break; + default: + fprintf(stderr, "Error: Unrecognized token in expression\n"); + exit(1); + } + } + +} + +void statement() +{ + // This function would handle parsing and executing statements + // For now, it is a placeholder + // You can implement your logic here + switch (tkn) + { + case TOKEN_IF: + { + match(TOKEN_IF); + match('('); + expression(TOKEN_ASSIGN); // Parse the condition expression + match(')'); + emit(1, JZ); // Emit jump if zero instruction + VM_INSTRUCTION *b = ++text; // Placeholder for jump address + statement(); // Parse the statement inside the if block + if (tkn == TOKEN_ELSE) + { + match(TOKEN_ELSE); + emit(1, JMP); // Emit jump instruction + *b = text + 2; // Set the jump address to the next instruction + b = ++text; + statement(); // Parse the else block + } + *b = text + 1; // Set the jump address to the next instruction + } + break; + case TOKEN_WHILE: + { + VM_INSTRUCTION *a = NULL; // Placeholder for jump address + VM_INSTRUCTION *b = text+1; // Placeholder for jump address + match(TOKEN_WHILE); + match('('); + expression(TOKEN_ASSIGN); // Parse the condition expression + match(')'); + emit(1, JZ); // Emit jump if zero instruction + a=++text; // Set the jump 
address to the start of the while block + statement(); // Parse the statement inside the while block + emit(1, JMP); // Emit jump instruction to loop back + emit(1, b); // Emit the address to jump back to + *a = text + 1; // Set the jump address to the next instruction + } + break; + case TOKEN_FUNC: + match(TOKEN_FUNC); + if (tkn != TOKEN_ID) + { + fprintf(stderr, "Error: function name\n"); + exit(1); + } + sym_cur->type = TYPE_FUNC; + sym_cur->data = malloc(1024*8); + VM_INSTRUCTION *save = text; + text = sym_cur->data; + match(TOKEN_ID); + match('('); + match(')'); + statement(); + if (*text != RET) emit(1, RET); + text = save; + break; + case TOKEN_RETURN: + match(TOKEN_RETURN); + expression(TOKEN_ASSIGN); + emit(1, RET); + break; + case '{': + match('{'); + while (tkn != '}') + statement(); + match('}'); + break; + case '\0': + return; + default: + expression(TOKEN_ASSIGN); + if (tkn == ';') + match(';'); + break; + } +} + +void program() +{ + while (1) + { + // This is a placeholder for the main program loop + // You would typically call emit or other functions here based on your program logic + // Add your program logic here + interp_malloc(); + orig = text+1; + char *isrc = src; + VM_INSTRUCTION *itext = text; + interp_memreset(); + printf(">>> "); + fflush(stdout); + fgets(src, poolsize-1, stdin); // Read input from stdin + token_next(); + statement(); + emit(1, EXIT); // Emit a token with value 0 to indicate end of processing + eval(); + printf("eval \n"); + puts(""); + free(itext); + free(isrc); + } +} diff --git a/src/tensor-array/interp/parser.h b/src/tensor-array/interp/parser.h new file mode 100644 index 0000000..021ba4a --- /dev/null +++ b/src/tensor-array/interp/parser.h @@ -0,0 +1,17 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +void program(); diff --git a/src/tensor-array/interp/sym_map.cc b/src/tensor-array/interp/sym_map.cc new file mode 100644 index 0000000..9794848 --- /dev/null +++ b/src/tensor-array/interp/sym_map.cc @@ -0,0 +1,38 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include "sym_map.h" + +sym_data* sym_cur = NULL; + +scope sym_map; + +void sym_data_set(char* name, sym_data dat) +{ + sym_map[name] = dat; +} + +sym_data* sym_data_get(char* name) +{ + return &sym_map[name]; +} + +int glob_data_find(char* name) +{ + return sym_map.find(name) != sym_map.end(); +} diff --git a/src/tensor-array/interp/sym_map.h b/src/tensor-array/interp/sym_map.h new file mode 100644 index 0000000..b60eeb0 --- /dev/null +++ b/src/tensor-array/interp/sym_map.h @@ -0,0 +1,40 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifdef __cplusplus +extern "C" +{ +#endif + typedef struct + { + int tkn; + int hash; + int cls; + int type; + void* data; // Pointer to additional data if needed + } sym_data; + void sym_data_set(char* name, sym_data dat); + sym_data* sym_data_get(char*); + int glob_data_find(char* name); + extern sym_data* sym_cur; +#ifdef __cplusplus +} + +#include +#include +typedef std::map scope; +extern scope sym_map; +#endif diff --git a/src/tensor-array/interp/token.c b/src/tensor-array/interp/token.c new file mode 100644 index 0000000..888150d --- /dev/null +++ b/src/tensor-array/interp/token.c @@ -0,0 +1,313 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include +#include +#include "sym_map.h" +#include "open_file.h" +#include "token.h" + +int tkn = 0; +int tkn_val = 0; // Variable to hold the value of the current token + +char* tknname[] = { + "num", "sys", "glo", "loc", "id", + "func", "else", "enum", "if", "return", "sizeof", + "while", "assign", "cond", "lor", "lan", + "or", "xor", "and", + "eq", "ne", "lt", "gt", "le", "ge", + "add", "sub", "mul", "div", "matmul", "pos", "neg", "not", "brak" +}; +void token_next() +{ + while ((tkn = *src++) != '\0') + { + switch (tkn) + { + case ' ': + case '\t': + /* code */ + break; + case '#': + /* code */ + while (*src != '\n' && *src != '\0') src++; + break; + case '"': + case '\'': + { + char* last_pos = src; + while (*src != tkn && *src != '\0') src++; + char *st1 = malloc(src-last_pos+2); + memset(st1, 0, src-last_pos+2); + src = last_pos; + for (unsigned int i = 0; *src != tkn && *src != '\0'; i++) + { + tkn_val=*src++; + if (tkn_val == '\\') + { + switch (*++src) + { + case '\\': + tkn_val='\\'; + break; + case 'r': + tkn_val='\r'; + break; + case 'n': + tkn_val='\n'; + break; + default: + if (*src == tkn) tkn_val=tkn; + else + { + fprintf(stderr, "invalid character string."); + exit(1); + } + break; + } + } + st1[i] = tkn_val; + } + tkn_val = st1; // Store the start of the string literal + } + return; // Exit after processing the string literal + case '/': + switch (src[0]) + { + case '/': + /* code */ + while (*src != '\n' && *src != '\0') src++; + break; + case '*': + /* code */ + src++; + while (*src != '\0' && !(src[0] == '*' && src[1] == '/')) src++; + if (*src == '\0') { + fprintf(stderr, "Error: Unmatched comment block\n"); + exit(1); + } + src += 2; // Skip past the closing */ + break; + case '=': + src++; + tkn = TOKEN_DIV; // Store the token value + return; // Exit after processing the division operator + default: + tkn = TOKEN_DIV; // Store the token value + return; // Exit after processing the division operator + } + + case 
'*': + if (*src == '=') + { + src++; + tkn = TOKEN_MUL; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_MUL; // Store the token value + return; // Exit after processing the token + } + case '+': + if (*src == '=') + { + src++; + tkn = TOKEN_ADD; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_ADD; // Store the token value + return; // Exit after processing the token + } + case '-': + if (*src == '=') + { + src++; + tkn = TOKEN_SUB; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_SUB; // Store the token value + return; // Exit after processing the token + } + case '=': + if (*src == '=') + { + src++; + tkn = TOKEN_EQ; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_ASSIGN; // Store the token value + return; // Exit after processing the token + } + case '!': + if (*src == '=') + { + src++; + tkn = TOKEN_NE; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_NOT; // Store the token value + return; // Exit after processing the token + } + case '<': + if (*src == '=') + { + src++; + tkn = TOKEN_LE; // Store the token value + return; // Exit after processing the token + } + else if (*src == '<') + { + src++; + tkn = TOKEN_SHL; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_LT; // Store the token value + return; // Exit after processing the token + } + case '>': + if (*src == '=') + { + src++; + tkn = TOKEN_GE; // Store the token value + return; // Exit after processing the token + } + else if (*src == '>') + { + src++; + tkn = TOKEN_SHR; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_GT; // Store the token value + return; // Exit after processing the token + } + case '&': + if (*src == '&') + { + src++; + tkn = TOKEN_AND; 
// Store the token value + return; // Exit after processing the token + } + else + { + fprintf(stderr, "Error: Unrecognized token '&'\n"); + exit(1); + } + case '|': + if (*src == '|') + { + src++; + tkn = TOKEN_LOR; // Store the token value + return; // Exit after processing the token + } + else + { + fprintf(stderr, "Error: Unrecognized token '|'\n"); + exit(1); + } + case '@': + src++; + tkn = TOKEN_MATMUL; // Store the token value + return; // Exit after processing the token + case '[': + case ']': + case '(': + case ')': + case '{': + case '}': + case ',': + case ';': + case ':': + return; + default: + if (tkn >= '0' && tkn <= '9') + { + tkn_val = tkn - '0'; + if (tkn == '0' && (*src == 'x' || *src == 'X')) + { + src++; + while ((*src >= '0' && *src <= '9') || (*src >= 'a' && *src <= 'f') || (*src >= 'A' && *src <= 'F')) + { + tkn_val = (tkn_val << 4) + (*src >= '0' && *src <= '9' ? *src - '0' : (*src >= 'a' && *src <= 'f' ? *src - 'a' + 10 : *src - 'A' + 10)); + src++; + } + /* code to handle hexadecimal number */ + } + else + { + while (*src >= '0' && *src <= '9') + { + tkn_val = (tkn_val * 10) + (*src - '0'); + src++; + } + /* code to handle decimal number */ + } + tkn = TOKEN_NUM; // Set the token type + return; // Exit after processing the number + } + else if ((tkn >= 'a' && tkn <= 'z') || (tkn >= 'A' && tkn <= 'Z') || tkn == '_') + { + char* last_pos = src - 1; + long hash = tkn; + while ((*src >= '0' && *src <= '9') || (*src >= 'a' && *src <= 'z') || (*src >= 'A' && *src <= 'Z') || *src == '_') + { + hash = (hash * 0x40) + *src; + src++; + } + int char_len = src-last_pos; + char *name = malloc(char_len+1); + memcpy(name, last_pos, char_len); + name[char_len] = '\0'; + tkn_val = name; + if (glob_data_find(name)) + { + /* code to handle existing identifier */ + tkn = sym_data_get(name)->tkn; // Set the token type from the existing identifier + return; // Exit after processing the existing identifier + } + /* code to handle identifiers */ + sym_data item; 
+ item.hash = hash; + item.data = NULL; // Initialize data pointer if needed + + tkn = item.tkn = TOKEN_ID; // Set the token type + sym_data_set(name,item); + sym_cur = sym_data_get(name); + return; // Exit after processing the identifier + } + else + { + printf("invalid symbol %c", tkn); + } + break; + } + } + +} \ No newline at end of file diff --git a/src/tensor-array/interp/token.h b/src/tensor-array/interp/token.h new file mode 100644 index 0000000..835eaf5 --- /dev/null +++ b/src/tensor-array/interp/token.h @@ -0,0 +1,30 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +typedef enum +{ + TOKEN_NUM = 0x80, TOKEN_SYS, TOKEN_GLO, TOKEN_LOC, TOKEN_ID, + TOKEN_FUNC, TOKEN_ELSE, TOKEN_ENUM, TOKEN_IF, TOKEN_RETURN, TOKEN_SIZEOF, + TOKEN_WHILE, TOKEN_ASSIGN, TOKEN_COND, TOKEN_LOR, TOKEN_LAN, + TOKEN_OR, TOKEN_XOR, TOKEN_AND, TOKEN_SHL, TOKEN_SHR, + TOKEN_EQ, TOKEN_NE, TOKEN_LT, TOKEN_GT, TOKEN_LE, TOKEN_GE, TOKEN_POS, TOKEN_NEG, + TOKEN_ADD, TOKEN_SUB, TOKEN_MUL, TOKEN_DIV, TOKEN_MATMUL, TOKEN_INC, TOKEN_NOT +} TOKEN_TYPE; + +void token_next(); +extern int tkn; +extern int tkn_val; +extern char *tknname[]; diff --git a/src/tensor-array/interp/vm.c b/src/tensor-array/interp/vm.c new file mode 100644 index 0000000..edd4a72 --- /dev/null +++ b/src/tensor-array/interp/vm.c @@ -0,0 +1,214 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include "vm_instruction.h" +#include "vmop.h" +#include "vm.h" + +VM_INSTRUCTION* orig = NULL; +VM_INSTRUCTION* pc = NULL; + +void eval() +{ + printf("vmstart\n"); + VM_INSTRUCTION_V2 op; + pc = orig; + while (1) + { + op = *((VM_INSTRUCTION_V2*)pc++); + printf("vmopassign %ld %ld %ld \n", orig, pc, op); + switch (op) + { + case LEA: + // Load effective address + break; + case IMM: + // Immediate value + any_type = *pc++; + any_value = *pc++; + op_imm(); + break; + case JMP: + // Jump to address + pc = (VM_INSTRUCTION*) *pc; + break; + case CALL: + // Function call + break; + case JZ: + // Jump if zero + pc = (any_value) ? 
(VM_INSTRUCTION*)*pc : pc + 1; + break; + case JNZ: + // Jump if not zero + pc = (any_value) ? pc + 1 : (VM_INSTRUCTION*)*pc; + break; + case ENT: + // Enter function + break; + case ADJ: + // Adjust stack pointer + break; + case LEV: + // Leave function + break; + case RET: + // Return from function + return; + case LI: + // Load integer + break; + case LC: + // Load character + break; + case SI: + // Store integer + break; + case SC: + // Store character + break; + case SET: + // Set value + op_set(); + break; + case GET: + // Get value + op_get(); + break; + case PUSH: + // Push value onto stack + op_push(); + break; + case PTR_PUSH: + // Push value onto stack + op_ptr_push(); + break; + case GETELEM: + // Get element from array + break; + case SETELEM: + // Set element in array + break; + case ADDELEM: + // Add element to array + break; + case OR: + // Logical OR operation + op_or(); + break; + case XOR: + // Logical XOR operation + break; + case AND: + // Logical AND operation + op_and(); + break; + case EQ: + // Equality check + op_eq(); + break; + case NE: + // Not equal check + op_ne(); + break; + case LT: + // Less than check + op_lt(); + break; + case GT: + // Greater than check + op_gt(); + break; + case LE: + // Less than or equal check + op_le(); + break; + case GE: + // Greater than or equal check + op_ge(); + break; + case ADD: + // Addition operation + op_add(); + break; + case SUB: + // Subtraction operation + op_sub(); + break; + case MUL: + // Multiplication operation + op_mul(); + break; + case DIV: + // Division operation + op_div(); + break; + case MATMUL: + // Matrix multiplication operation + op_matmul(); + break; + case POS: + // Unary plus operation + op_pos(); + break; + case NEG: + // Unary minus operation + op_neg(); + break; + case NOT: + // Logical NOT operation + op_not(); + break; + case SHL: + // Shift left operation + break; + case SHR: + // Shift right operation + break; + case OPEN: + // Open file operation + op_open(); + break; 
+ case READ: + // Read from file operation + break; + case CLOSE: + // Close file operation + break; + case PRTF: + // Print formatted output + break; + case MALC: + // Memory allocation operation + break; + case MSET: + // Memory set operation + break; + case MCMP: + // Memory compare operation + break; + case EXIT: + // Exit operation + op_exit(); + return; + default: + fprintf(stderr, "Unknown instruction: %d\n", op); + exit(1); + } + } +} diff --git a/src/tensor-array/interp/vm.h b/src/tensor-array/interp/vm.h new file mode 100644 index 0000000..6c1633e --- /dev/null +++ b/src/tensor-array/interp/vm.h @@ -0,0 +1,26 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +typedef enum +{ + LEA, IMM, JMP, CALL, JZ, JNZ, ENT, ADJ, LEV, RET, LI, LC, SI, SC, SET, GET, PUSH, PTR_PUSH, GETELEM, SETELEM, ADDELEM, + OR, XOR, AND, EQ, NE, LT, GT, LE, GE, ADD, SUB, MUL, DIV, MATMUL, POS, NEG, NOT, SHL, SHR, + OPEN, READ, CLOSE, PRTF, MALC, MSET, MCMP, EXIT +} VM_INSTRUCTION_V2; + +void eval(); + +extern size_t any_value; diff --git a/src/tensor_array/core/extern_type_map.hh b/src/tensor-array/interp/vm_instruction.h similarity index 75% rename from src/tensor_array/core/extern_type_map.hh rename to src/tensor-array/interp/vm_instruction.h index 9146a3e..29677f5 100644 --- a/src/tensor_array/core/extern_type_map.hh +++ b/src/tensor-array/interp/vm_instruction.h @@ -14,13 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include - -namespace tensor_array +#ifdef __cplusplus +extern "C" { - namespace value - { - extern std::unordered_map dynamic_type_size; - } +#endif + typedef size_t VM_INSTRUCTION; + extern VM_INSTRUCTION* orig; + extern VM_INSTRUCTION* pc; +#ifdef __cplusplus } +#endif \ No newline at end of file diff --git a/src/tensor-array/interp/vm_type.h b/src/tensor-array/interp/vm_type.h new file mode 100644 index 0000000..740016a --- /dev/null +++ b/src/tensor-array/interp/vm_type.h @@ -0,0 +1,31 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef __cplusplus +extern "C" +{ +#endif + typedef enum + { + TYPE_NONE, + TYPE_STRING, + TYPE_INT, + TYPE_PTR, + TYPE_FUNC + } VM_TYPE; +#ifdef __cplusplus +} +#endif diff --git a/src/tensor-array/interp/vmop.cc b/src/tensor-array/interp/vmop.cc new file mode 100644 index 0000000..022e672 --- /dev/null +++ b/src/tensor-array/interp/vmop.cc @@ -0,0 +1,317 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include +#include +#include "sym_map.h" +#include "vm_instruction.h" +#include "vmop.h" +#include "vm_type.h" + +std::stack tensor_stack; +std::stack var_stack; +std::stack> call_stack; +tensor_array::value::Tensor ag; +void* aptr; +size_t any_value; +size_t any_type; + +void new_int() +{ + tensor_array::value::TensorArray tmp2 = {any_value}; + tensor_array::value::Tensor tmp1(tmp2); + ag = tmp1; +} + +void new_ptr() +{ + aptr = reinterpret_cast(any_value); +} + +void new_string() +{ + char* str = reinterpret_cast(any_value); + unsigned int s_len = std::strlen(str); + tensor_array::value::TensorBase tmp1(typeid(char),{s_len}, str); + ag = tmp1; + std::free(str); +} + +void op_imm() +{ + switch (any_type) + { + case TYPE_INT: + /* code */ + new_int(); + break; + case TYPE_STRING: + new_string(); + break; + case TYPE_PTR: + new_ptr(); + break; + default: + break; + } +} + +void op_add() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag += tensor_stack.top(); + 
tensor_stack.pop(); +} + +void op_sub() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag -= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_mul() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag *= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_div() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag /= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_matmul() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = tensor_array::value::matmul(ag, tensor_stack.top()); + tensor_stack.pop(); +} + +void op_pos() +{ + ag = +ag; +} + +void op_neg() +{ + ag = -ag; +} + +void op_and() +{ + ag = ag && tensor_stack.top(); + tensor_stack.pop(); +} + +void op_or() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag || tensor_stack.top(); + tensor_stack.pop(); +} + +void op_not() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = !ag; +} + +void op_eq() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag == tensor_stack.top(); + tensor_stack.pop(); +} + +void op_ne() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag != tensor_stack.top(); + tensor_stack.pop(); +} + +void op_lt() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag < tensor_stack.top(); + tensor_stack.pop(); +} + +void op_gt() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag > tensor_stack.top(); + tensor_stack.pop(); +} + +void op_le() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag <= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_ge() +{ + if 
(tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag >= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_shl() +{ + // ag = ag << bg; +} + +void op_shr() +{ + // ag = ag >> bg; +} + +void op_call() +{ + VM_INSTRUCTION* pc1 = (VM_INSTRUCTION*)*pc++; + call_stack.push({std::move(pc), std::move(sym_map)}); + pc = pc1; +} + +void op_ret() +{ + pc = call_stack.top().first; + sym_map = std::move(call_stack.top().second); + call_stack.pop(); +} + +void op_open() +{ + // Implementation for opening a file or resource +} + +void op_read() +{ + // Implementation for reading from a file or resource +} + +void op_close() +{ + // Implementation for closing a file or resource +} + +void op_prtf() +{ + // Implementation for printing formatted output +} + +void op_malc() +{ + // Implementation for memory allocation +} + +void op_mset() +{ + // Implementation for setting memory +} + +void op_mcmp() +{ + // Implementation for memory comparison +} + +void op_exit() +{ + // Implementation for exiting the program + // std::cout << ag << std::endl; +} + +void op_push() +{ + tensor_stack.push(ag); +} + +void op_ptr_push() +{ + var_stack.push(reinterpret_cast(aptr)); + std::free(aptr); +} + +void op_get() +{ + char *var_name = reinterpret_cast(aptr); + sym_data& temp = sym_map[var_name]; + std::free(aptr); + ag = *reinterpret_cast(temp.data); +} + +void op_set() +{ + if (!var_stack.empty()) + { + std::string& var_name = var_stack.top(); + sym_data& temp = sym_map[var_name]; + delete temp.data; // Set the top of the stack to ag + temp.data = new tensor_array::value::Tensor(ag); + var_stack.pop(); + } + else + { + throw std::runtime_error("Tensor stack is empty"); + } +} diff --git a/src/tensor-array/interp/vmop.h b/src/tensor-array/interp/vmop.h new file mode 100644 index 0000000..a8a98e0 --- /dev/null +++ b/src/tensor-array/interp/vmop.h @@ -0,0 +1,57 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 
2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifdef __cplusplus +extern "C" +{ +#endif + extern size_t any_value; + extern size_t any_type; + void op_imm(); + void op_add(); + void op_sub(); + void op_mul(); + void op_div(); + void op_matmul(); + void op_pos(); + void op_neg(); + void op_and(); + void op_or(); + void op_not(); + void op_eq(); + void op_ne(); + void op_lt(); + void op_gt(); + void op_le(); + void op_ge(); + void op_shl(); + void op_shr(); + void op_open(); + void op_read(); + void op_close(); + void op_prtf(); + void op_malc(); + void op_mset(); + void op_mcmp(); + void op_exit(); + void op_push(); + void op_ptr_push(); + void op_get(); + void op_set(); +#ifdef __cplusplus +} +#endif + diff --git a/src/tensor_array/layers/attention.cc b/src/tensor-array/layers/attention.cc similarity index 100% rename from src/tensor_array/layers/attention.cc rename to src/tensor-array/layers/attention.cc diff --git a/src/tensor_array/layers/attention.hh b/src/tensor-array/layers/attention.hh similarity index 73% rename from src/tensor_array/layers/attention.hh rename to src/tensor-array/layers/attention.hh index 8cd7480..dbb4f3c 100644 --- a/src/tensor_array/layers/attention.hh +++ b/src/tensor-array/layers/attention.hh @@ -19,13 +19,23 @@ limitations under the License. 
#include "normalization.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - value::Tensor CUDA_ML_API scaled_dot_product_attention(const value::Tensor&, const value::Tensor&, const value::Tensor&, const value::Tensor& = value::Tensor()); + value::Tensor TENSOR_ARRAY_API scaled_dot_product_attention(const value::Tensor&, const value::Tensor&, const value::Tensor&, const value::Tensor& = value::Tensor()); - class CUDA_ML_API MultiHeadAttentionImpl final : + class TENSOR_ARRAY_API MultiHeadAttentionImpl final : public LayerImpl { private: @@ -39,4 +49,6 @@ namespace tensor_array using MultiHeadAttention = LayerHolder; } -} \ No newline at end of file +} + +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/convolution.cc b/src/tensor-array/layers/convolution.cc similarity index 100% rename from src/tensor_array/layers/convolution.cc rename to src/tensor-array/layers/convolution.cc diff --git a/src/tensor_array/layers/convolution.hh b/src/tensor-array/layers/convolution.hh similarity index 86% rename from src/tensor_array/layers/convolution.hh rename to src/tensor-array/layers/convolution.hh index 644f1a7..e217a86 100644 --- a/src/tensor_array/layers/convolution.hh +++ b/src/tensor-array/layers/convolution.hh @@ -18,11 +18,21 @@ limitations under the License. 
#include "layer_utility.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API ConvolutionLayerImpl : + class TENSOR_ARRAY_API ConvolutionLayerImpl : public TensorCalculateLayerImpl { protected: @@ -39,7 +49,7 @@ namespace tensor_array value::Tensor calculate(const value::Tensor&) override final; }; - class CUDA_ML_API Conv1D_Impl final : + class TENSOR_ARRAY_API Conv1D_Impl final : public ConvolutionLayerImpl { public: @@ -49,7 +59,7 @@ namespace tensor_array using Conv1D = LayerHolder; - class CUDA_ML_API Conv2D_Impl final : + class TENSOR_ARRAY_API Conv2D_Impl final : public ConvolutionLayerImpl { public: @@ -59,7 +69,7 @@ namespace tensor_array using Conv2D = LayerHolder; - class CUDA_ML_API Conv3D_Impl final : + class TENSOR_ARRAY_API Conv3D_Impl final : public ConvolutionLayerImpl { public: @@ -71,3 +81,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/layer_any.cc b/src/tensor-array/layers/layer_any.cc similarity index 100% rename from src/tensor_array/layers/layer_any.cc rename to src/tensor-array/layers/layer_any.cc diff --git a/src/tensor_array/layers/layer_any.hh b/src/tensor-array/layers/layer_any.hh similarity index 100% rename from src/tensor_array/layers/layer_any.hh rename to src/tensor-array/layers/layer_any.hh diff --git a/src/tensor_array/layers/layer_holder.hh b/src/tensor-array/layers/layer_holder.hh similarity index 100% rename from src/tensor_array/layers/layer_holder.hh rename to src/tensor-array/layers/layer_holder.hh diff --git a/src/tensor_array/layers/layer_impl.cc b/src/tensor-array/layers/layer_impl.cc similarity index 100% rename from src/tensor_array/layers/layer_impl.cc rename to src/tensor-array/layers/layer_impl.cc diff --git 
a/src/tensor_array/layers/layer_impl.hh b/src/tensor-array/layers/layer_impl.hh similarity index 87% rename from src/tensor_array/layers/layer_impl.hh rename to src/tensor-array/layers/layer_impl.hh index 7197c38..7852524 100644 --- a/src/tensor_array/layers/layer_impl.hh +++ b/src/tensor-array/layers/layer_impl.hh @@ -16,25 +16,26 @@ limitations under the License. #include #include -#include +#include #include -#pragma once -#ifdef __WIN32__ -#ifdef CUDA_ML_EXPORTS -#define CUDA_ML_API __declspec(dllexport) +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) #else -#define CUDA_ML_API __declspec(dllimport) +#define TENSOR_ARRAY_API __declspec(dllimport) #endif #else -#define CUDA_ML_API +#define TENSOR_ARRAY_API #endif +#pragma once + namespace tensor_array { namespace layers { - class CUDA_ML_API LayerImpl + class TENSOR_ARRAY_API LayerImpl { private: bool is_running = false; @@ -67,4 +68,6 @@ namespace tensor_array public CalculateStruct {}; } -} \ No newline at end of file +} + +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/layer_utility.cc b/src/tensor-array/layers/layer_utility.cc similarity index 100% rename from src/tensor_array/layers/layer_utility.cc rename to src/tensor-array/layers/layer_utility.cc diff --git a/src/tensor_array/layers/layer_utility.hh b/src/tensor-array/layers/layer_utility.hh similarity index 66% rename from src/tensor_array/layers/layer_utility.hh rename to src/tensor-array/layers/layer_utility.hh index a0966d9..5b901fd 100644 --- a/src/tensor_array/layers/layer_utility.hh +++ b/src/tensor-array/layers/layer_utility.hh @@ -18,13 +18,23 @@ limitations under the License. 
#pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { typedef value::Tensor(*LayerFunction)(const value::Tensor&); - class CUDA_ML_API ActivationImpl final : + class TENSOR_ARRAY_API ActivationImpl final : public TensorCalculateLayerImpl { public: @@ -34,7 +44,7 @@ namespace tensor_array const LayerFunction func; }; - class CUDA_ML_API ReShapeImpl final : + class TENSOR_ARRAY_API ReShapeImpl final : public TensorCalculateLayerImpl { public: @@ -44,13 +54,15 @@ namespace tensor_array const std::vector shape; }; - value::Tensor CUDA_ML_API NoActivation(const value::Tensor&); - value::Tensor CUDA_ML_API ReLU(const value::Tensor&); - value::Tensor CUDA_ML_API tanh(const value::Tensor&); - value::Tensor CUDA_ML_API Sigmoid(const value::Tensor&); - value::Tensor CUDA_ML_API SoftMax(const value::Tensor&, unsigned char dim); + value::Tensor TENSOR_ARRAY_API NoActivation(const value::Tensor&); + value::Tensor TENSOR_ARRAY_API ReLU(const value::Tensor&); + value::Tensor TENSOR_ARRAY_API tanh(const value::Tensor&); + value::Tensor TENSOR_ARRAY_API Sigmoid(const value::Tensor&); + value::Tensor TENSOR_ARRAY_API SoftMax(const value::Tensor&, unsigned char dim); using Activation = LayerHolder; using ReShape = LayerHolder; } } + +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/linear.cc b/src/tensor-array/layers/linear.cc similarity index 100% rename from src/tensor_array/layers/linear.cc rename to src/tensor-array/layers/linear.cc diff --git a/src/tensor_array/layers/linear.hh b/src/tensor-array/layers/linear.hh similarity index 82% rename from src/tensor_array/layers/linear.hh rename to src/tensor-array/layers/linear.hh index 6d87fdd..45cbc4e 100644 --- a/src/tensor_array/layers/linear.hh +++ b/src/tensor-array/layers/linear.hh @@ -17,11 +17,21 @@ limitations 
under the License. #include "layer_holder.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API LinearImpl final : + class TENSOR_ARRAY_API LinearImpl final : public TensorCalculateLayerImpl { private: @@ -43,4 +53,4 @@ namespace tensor_array } } - +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/normalization.cc b/src/tensor-array/layers/normalization.cc similarity index 100% rename from src/tensor_array/layers/normalization.cc rename to src/tensor-array/layers/normalization.cc diff --git a/src/tensor_array/layers/normalization.hh b/src/tensor-array/layers/normalization.hh similarity index 82% rename from src/tensor_array/layers/normalization.hh rename to src/tensor-array/layers/normalization.hh index 1afa8b6..6d8a8f7 100644 --- a/src/tensor_array/layers/normalization.hh +++ b/src/tensor-array/layers/normalization.hh @@ -17,11 +17,21 @@ limitations under the License. 
#include "layer_holder.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API NormalizationImpl final : + class TENSOR_ARRAY_API NormalizationImpl final : public TensorCalculateLayerImpl { private: @@ -39,3 +49,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/recurrent.cc b/src/tensor-array/layers/recurrent.cc similarity index 100% rename from src/tensor_array/layers/recurrent.cc rename to src/tensor-array/layers/recurrent.cc diff --git a/src/tensor_array/layers/recurrent.hh b/src/tensor-array/layers/recurrent.hh similarity index 86% rename from src/tensor_array/layers/recurrent.hh rename to src/tensor-array/layers/recurrent.hh index 1041e92..721cd2a 100644 --- a/src/tensor_array/layers/recurrent.hh +++ b/src/tensor-array/layers/recurrent.hh @@ -18,11 +18,21 @@ limitations under the License. 
#include "layer_utility.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API RecurrentImpl : + class TENSOR_ARRAY_API RecurrentImpl : public TensorCalculateLayerImpl { private: @@ -41,7 +51,7 @@ namespace tensor_array }; using Recurrent = LayerHolder; - class CUDA_ML_API LSTM_Impl : + class TENSOR_ARRAY_API LSTM_Impl : public TensorCalculateLayerImpl { private: @@ -60,3 +70,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/sequential.cc b/src/tensor-array/layers/sequential.cc similarity index 100% rename from src/tensor_array/layers/sequential.cc rename to src/tensor-array/layers/sequential.cc diff --git a/src/tensor_array/layers/sequential.hh b/src/tensor-array/layers/sequential.hh similarity index 79% rename from src/tensor_array/layers/sequential.hh rename to src/tensor-array/layers/sequential.hh index db7854a..cbe95e0 100644 --- a/src/tensor_array/layers/sequential.hh +++ b/src/tensor-array/layers/sequential.hh @@ -17,13 +17,23 @@ limitations under the License. 
#include "layer_holder.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { using LayerInSequential = LayerHolder; - class CUDA_ML_API SequentialImpl final : + class TENSOR_ARRAY_API SequentialImpl final : public TensorCalculateLayerImpl { private: @@ -37,7 +47,7 @@ namespace tensor_array value::Tensor calculate(const value::Tensor&) override; }; - class CUDA_ML_API Sequential : public LayerHolder + class TENSOR_ARRAY_API Sequential : public LayerHolder { public: Sequential() = default; @@ -46,3 +56,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/transformer.cc b/src/tensor-array/layers/transformer.cc similarity index 100% rename from src/tensor_array/layers/transformer.cc rename to src/tensor-array/layers/transformer.cc diff --git a/src/tensor_array/layers/transformer.hh b/src/tensor-array/layers/transformer.hh similarity index 87% rename from src/tensor_array/layers/transformer.hh rename to src/tensor-array/layers/transformer.hh index c4bac31..a2c8257 100644 --- a/src/tensor_array/layers/transformer.hh +++ b/src/tensor-array/layers/transformer.hh @@ -17,11 +17,21 @@ limitations under the License. 
#include "attention.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API TransformerEncoderImpl final : + class TENSOR_ARRAY_API TransformerEncoderImpl final : public TensorCalculateLayerImpl { private: @@ -37,7 +47,7 @@ namespace tensor_array using TransformerEncoder = LayerHolder; - class CUDA_ML_API TransformerDecoderImpl final : + class TENSOR_ARRAY_API TransformerDecoderImpl final : public LayerImpl { private: @@ -70,3 +80,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/core/CMakeLists.txt b/src/tensor_array/core/CMakeLists.txt deleted file mode 100644 index fe9cede..0000000 --- a/src/tensor_array/core/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -cmake_minimum_required(VERSION 3.8.0) - -enable_language(CUDA C CXX) - -set(CMAKE_CUDA_COMPILER nvcc) -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_ARCHITECTURES 52 75 89) -set(CMAKE_CUDA_SEPARABLE_COMPILATION TRUE) -set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G") # enable cuda-gdb - -file(GLOB TensorArray_src "*.cc" "*.cu") -file(GLOB TensorArray_inc "*.hh") - -# file(MAKE_DIRECTORY "include/tensor_array/core") - -install( - FILES ${TensorArray_inc} - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/tensor_array/core - COMPONENT headers) - -add_library(tensorarray_core SHARED ${TensorArray_src}) -add_library(TensorArray::Core ALIAS tensorarray_core) -target_link_libraries(tensorarray_core PRIVATE cublas) - -install( - TARGETS tensorarray_core - EXPORT TensorArrayTargets - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT libraries - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT libraries) diff --git a/src/tensor_array/core/initializer_wrapper.hh b/src/tensor_array/core/initializer_wrapper.hh deleted file mode 100644 index 7e8e116..0000000 
--- a/src/tensor_array/core/initializer_wrapper.hh +++ /dev/null @@ -1,64 +0,0 @@ -/* -Copyright 2024 TensorArray-Creators - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include - -namespace tensor_array -{ - namespace wrapper - { - template - class initializer_wrapper - { - public: - typedef _E value_type; - typedef const _E& reference; - typedef const _E& const_reference; - typedef size_t size_type; - typedef const _E* iterator; - typedef const _E* const_iterator; - - private: -#ifdef __GNUC__ - iterator _M_array; - size_type _M_len; -#endif - - public: - constexpr initializer_wrapper(const_iterator __a, size_type __l) - : _M_array(__a), _M_len(__l) { } - - constexpr initializer_wrapper(const_iterator __begin, const_iterator __end) - : _M_array(__begin), _M_len(__end - __begin) { } - - constexpr initializer_wrapper() noexcept: _M_array(0), _M_len(0) { } - - // Number of elements. - constexpr size_type - size() const noexcept { return _M_len; } - - // First element. - constexpr const_iterator - begin() const noexcept { return _M_array; } - - // One past the last element. 
- constexpr const_iterator - end() const noexcept { return begin() + size(); } - - constexpr operator std::initializer_list<_E>() const { return reinterpret_cast&>(*this); } - }; - } -} \ No newline at end of file diff --git a/src/tensor_array/layers/CMakeLists.txt b/src/tensor_array/layers/CMakeLists.txt deleted file mode 100644 index cef320d..0000000 --- a/src/tensor_array/layers/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -cmake_minimum_required(VERSION 3.8.0) - -enable_language(C CXX) - -file(GLOB TensorArray_src "*.cc") -file(GLOB TensorArray_inc "*.hh") - -install( - FILES ${TensorArray_inc} - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/tensor_array/layers - COMPONENT headers) - -add_library(tensorarray_layers SHARED ${TensorArray_src}) -add_library(TensorArray::Layers ALIAS tensorarray_layers) - -target_include_directories(tensorarray_layers PRIVATE ${PROJECT_SOURCE_DIR}/src) -target_link_libraries(tensorarray_layers TensorArray::Core) - -install( - TARGETS tensorarray_layers - EXPORT TensorArrayTargets - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT libraries - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT libraries) diff --git a/tests/_ b/tests/_ new file mode 100644 index 0000000..e69de29 diff --git a/tests/tensor-array/core/CMakeLists.txt b/tests/tensor-array/core/CMakeLists.txt new file mode 100644 index 0000000..26cc477 --- /dev/null +++ b/tests/tensor-array/core/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.18) + +# project(TensorArray_tests) + +include(cmake/ta_core_tests.cmake) diff --git a/tests/tensor-array/core/cmake/ta_core_tests.cmake b/tests/tensor-array/core/cmake/ta_core_tests.cmake new file mode 100644 index 0000000..da6401d --- /dev/null +++ b/tests/tensor-array/core/cmake/ta_core_tests.cmake @@ -0,0 +1,25 @@ + +set( + TensorArray_tests_src + "tensor_array_test.cc" + "print_output.cc" + # "tensor_operators.cc" + # "tensor_matmul_transpose.cc" + # "gradient.cc" + ) + +enable_testing() + 
+create_test_sourcelist( + TensorArray_tests + "test_driver.cc" + ${TensorArray_tests_src}) + +add_executable(tensorarray_core_tests ${TensorArray_tests}) +target_include_directories(tensorarray_core_tests PRIVATE ${PROJECT_SOURCE_DIR}/src) +target_link_libraries(tensorarray_core_tests PUBLIC TensorArray::core) + +foreach(test ${TensorArray_tests_src}) + get_filename_component(TName ${test} NAME_WE) + add_test(NAME ${TName} COMMAND tensorarray_core_tests ${TName}) +endforeach() diff --git a/tests/tensor-array/core/gradient.cc b/tests/tensor-array/core/gradient.cc new file mode 100644 index 0000000..fbb5768 --- /dev/null +++ b/tests/tensor-array/core/gradient.cc @@ -0,0 +1,41 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include + +using namespace std; +using namespace tensor_array::value; + +int gradient(int argc, char *argv[]) +{ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3, 4 }}, + {{ 5, 6, 7, 8 }}, + {{ 9, 10, 11, 12 }}, + {{ 13, 14, 15, 16 }}, + }}; + TensorArray example_tensor_array_scalar = {100}; + Tensor example_tensor_1(example_tensor_array); + Tensor example_tensor_2(example_tensor_array_scalar); + Tensor example_tensor_sum = example_tensor_1 + example_tensor_2; + cout << example_tensor_sum << endl; + example_tensor_sum.calc_grad(); + cout << example_tensor_1.get_grad() << endl; + cout << example_tensor_2.get_grad() << endl; + return 0; +} diff --git a/tests/tensor-array/core/print_output.cc b/tests/tensor-array/core/print_output.cc new file mode 100644 index 0000000..ad1c9be --- /dev/null +++ b/tests/tensor-array/core/print_output.cc @@ -0,0 +1,36 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include + +using namespace std; +using namespace tensor_array::value; + +int print_output(int argc, char *argv[]) +{ + /* code */ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3, 4 }}, + {{ 5, 6, 7, 8 }}, + {{ 9, 10, 11, 12 }}, + {{ 13, 14, 15, 16 }}, + }}; + Tensor example_tensor_1(example_tensor_array); + cout << example_tensor_1 << endl; + return 0; +} diff --git a/tests/tensor-array/core/tensor_array_test.cc b/tests/tensor-array/core/tensor_array_test.cc new file mode 100644 index 0000000..039ed01 --- /dev/null +++ b/tests/tensor-array/core/tensor_array_test.cc @@ -0,0 +1,31 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include + +using namespace tensor_array::value; + +int tensor_array_test(int argc, char *argv[]) +{ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3, 4 }}, + {{ 5, 6, 7, 8 }}, + {{ 9, 10, 11, 12 }}, + {{ 13, 14, 15, 16 }}, + }}; + return 0; +} diff --git a/tests/tensor-array/core/tensor_matmul_transpose.cc b/tests/tensor-array/core/tensor_matmul_transpose.cc new file mode 100644 index 0000000..f101971 --- /dev/null +++ b/tests/tensor-array/core/tensor_matmul_transpose.cc @@ -0,0 +1,33 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include + +using namespace tensor_array::value; + +int tensor_matmul_transpose(int argc, char *argv[]) +{ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3 }}, + {{ 4, 5, 6 }} + }}; + TensorArray example_tensor_array_scalar = {100}; + Tensor example_tensor_1(example_tensor_array); + Tensor example_tensor_2 = example_tensor_1.transpose(0, 1); + Tensor example_tensor_add = matmul(example_tensor_1, example_tensor_2); + return 0; +} diff --git a/tests/tensor-array/core/tensor_operators.cc b/tests/tensor-array/core/tensor_operators.cc new file mode 100644 index 0000000..9a840d4 --- /dev/null +++ b/tests/tensor-array/core/tensor_operators.cc @@ -0,0 +1,39 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include + +using namespace tensor_array::value; + +int tensor_operators(int argc, char *argv[]) +{ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3, 4 }}, + {{ 5, 6, 7, 8 }}, + {{ 9, 10, 11, 12 }}, + {{ 13, 14, 15, 16 }}, + }}; + TensorArray example_tensor_array_scalar = {100}; + Tensor example_tensor_1(example_tensor_array); + Tensor example_tensor_2(example_tensor_array_scalar); + Tensor example_tensor_add = example_tensor_1 + example_tensor_2; + Tensor example_tensor_sub = example_tensor_1 - example_tensor_2; + Tensor example_tensor_mul = example_tensor_1 * example_tensor_2; + Tensor example_tensor_div = example_tensor_1 + example_tensor_2; + return 0; +} + diff --git a/tests/tensor-array/layers/_ b/tests/tensor-array/layers/_ new file mode 100644 index 0000000..e69de29