diff --git a/.devcontainer/CPU/Dockerfile b/.devcontainer/CPU/Dockerfile new file mode 100644 index 0000000..3f47760 --- /dev/null +++ b/.devcontainer/CPU/Dockerfile @@ -0,0 +1,18 @@ +FROM mcr.microsoft.com/devcontainers/cpp:1-ubuntu-20.04 + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake.sh + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends diff --git a/.devcontainer/CPU/devcontainer.json b/.devcontainer/CPU/devcontainer.json new file mode 100644 index 0000000..4c1d146 --- /dev/null +++ b/.devcontainer/CPU/devcontainer.json @@ -0,0 +1,33 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/cpp +{ + "name": "C++", + "build": { + "context": "../..", + "dockerfile": "Dockerfile" + }, + + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools" + ] + } + } + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "gcc -v", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. 
More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.devcontainer/CUDA/Dockerfile b/.devcontainer/CUDA/Dockerfile new file mode 100644 index 0000000..d7df2de --- /dev/null +++ b/.devcontainer/CUDA/Dockerfile @@ -0,0 +1,24 @@ +FROM nvcr.io/nvidia/cuda:12.9.1-devel-ubuntu20.04 + +RUN apt-get update +RUN apt-get upgrade -y +RUN apt-get install curl -y + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake.sh + + + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends diff --git a/.devcontainer/CUDA/devcontainer.json b/.devcontainer/CUDA/devcontainer.json new file mode 100644 index 0000000..a59e314 --- /dev/null +++ b/.devcontainer/CUDA/devcontainer.json @@ -0,0 +1,43 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/cpp +{ + "name": "CUDA", + "build": { + "context": "../..", + "dockerfile": "Dockerfile" + }, + + "runArgs": [ + "--gpus", + "all" + ], + + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools" + ] + } + }, + + "hostRequirements": { + "gpu": "optional" + }, + + "features": { + "ghcr.io/devcontainers/features/git:1": {} + } + + // Use 'forwardPorts' to make a list of ports inside the container available locally. 
+ // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "gcc -v", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.devcontainer/ROCm/Dockerfile b/.devcontainer/ROCm/Dockerfile new file mode 100644 index 0000000..ee8c283 --- /dev/null +++ b/.devcontainer/ROCm/Dockerfile @@ -0,0 +1,24 @@ +FROM rocm/dev-ubuntu-20.04:latest + +RUN apt-get update +RUN apt-get upgrade -y +RUN apt-get install curl -y + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake.sh + + + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends diff --git a/.devcontainer/ROCm/devcontainer.json b/.devcontainer/ROCm/devcontainer.json new file mode 100644 index 0000000..302e224 --- /dev/null +++ b/.devcontainer/ROCm/devcontainer.json @@ -0,0 +1,43 @@ +// For format details, see https://aka.ms/devcontainer.json. 
For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/cpp +{ + "name": "ROCm", + "build": { + "context": "../..", + "dockerfile": "Dockerfile" + }, + + "runArgs": [ + "--gpus", + "all" + ], + + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools" + ] + } + }, + + "hostRequirements": { + "gpu": "optional" + }, + + "features": { + "ghcr.io/devcontainers/features/git:1": {} + } + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "gcc -v", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index 08d2ac4..0000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "image": "mcr.microsoft.com/devcontainers/universal:2", - "features": { - "ghcr.io/devcontainers/features/nvidia-cuda:1": { - "instalNvtx": true, - "installToolkit": true, - "cudaVersion": "12.3" - } - } -} diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..3805f8a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +build/ +lib/ +bin/ +include/ +build-temp/ +.git/ +.vscode/ diff --git a/.github/workflows/cmake-single-platform.yml b/.github/workflows/cmake-multi-platform.yml similarity index 55% rename from .github/workflows/cmake-single-platform.yml rename to .github/workflows/cmake-multi-platform.yml index e69bac7..7347fe3 100644 --- a/.github/workflows/cmake-single-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -1,6 +1,6 @@ # This starter workflow is for a CMake project running on a single platform. 
There is a different starter workflow if you need cross-platform coverage. # See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml -name: CMake on a single platform +name: CMake on multi platform on: push: @@ -17,23 +17,52 @@ jobs: # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. # You can convert this to a matrix build if you need cross-platform coverage. # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix - runs-on: ubuntu-latest strategy: fail-fast: false matrix: - cuda-version: [ "12.3" ] + os: [ "ubuntu-22.04", "ubuntu-22.04-arm" ] + cuda-version: [ "12.9", "12.4", "11.8" ] + rocm-enabled: [ false ] + include: + - os: "ubuntu-22.04" + cuda-version: "11.8" + rocm-enabled: true + rocm-version: "6.4.2" + - os: "windows-latest" + cuda-version: "12.4.1" + + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 + + - name: Run ROCm bash shell Ubuntu/Debian + if: (startsWith(matrix.os, 'ubuntu') || startsWith(matrix.os, 'debian')) && matrix.rocm-enabled + env: + temp: ${{ runner.temp }} + rocm: ${{ matrix.rocm-version }} + run: | + chmod +x ${{github.workspace}}/scripts/actions/install-rocm-ubuntu.sh + ${{github.workspace}}/scripts/actions/install-rocm-ubuntu.sh + shell: bash - - name: Run CUDA bash shell + - name: Run CUDA bash shell Ubuntu/Debian + if: startsWith(matrix.os, 'ubuntu') || startsWith(matrix.os, 'debian') env: temp: ${{ runner.temp }} cuda: ${{ matrix.cuda-version }} run: | - chmod +x ./scripts/actions/install-cuda-ubuntu.sh - ./scripts/actions/install-cuda-ubuntu.sh + chmod +x ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh + ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh shell: bash + - name: Run CUDA bash shell Windows + if: runner.os == 'Windows' + env: + temp: ${{ runner.temp }} + cuda: ${{ matrix.cuda-version }} + run: 
scripts/actions/install-cuda-windows.ps1 + shell: pwsh + - name: Configure CMake # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type @@ -44,7 +73,8 @@ jobs: run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test - working-directory: ${{github.workspace}}/build + working-directory: ${{github.workspace}}/ + if: runner.os != 'Windows' # Execute tests defined by the CMake configuration. # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail run: ctest -C ${{env.BUILD_TYPE}} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 89997e7..601cc8d 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -27,7 +27,7 @@ jobs: # - https://gh.io/supported-runners-and-hardware-resources # - https://gh.io/using-larger-runners # Consider using larger runners for possible analysis time improvements. 
- runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-22.04' }} timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} permissions: actions: read @@ -38,7 +38,7 @@ jobs: fail-fast: false matrix: language: [ 'c-cpp' ] - cuda-version: [ "12.3" ] + cuda-version: [ "12.9" ] # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ] # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both @@ -53,8 +53,8 @@ jobs: temp: ${{ runner.temp }} cuda: ${{ matrix.cuda-version }} run: | - chmod +x ./scripts/actions/install-cuda-ubuntu.sh - ./scripts/actions/install-cuda-ubuntu.sh + chmod +x ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh + ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh shell: bash # Initializes the CodeQL tools for scanning. diff --git a/.github/workflows/deploy-webpages.yml b/.github/workflows/deploy-webpages.yml new file mode 100644 index 0000000..3edbb5e --- /dev/null +++ b/.github/workflows/deploy-webpages.yml @@ -0,0 +1,88 @@ +# This starter workflow is for a CMake project running on a single platform. There is a different starter workflow if you need cross-platform coverage. +# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml +name: CMake on a single platform (deploy web pages) + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + workflow_dispatch: + +env: + # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) 
+ BUILD_TYPE: Release + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + build: + # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. + # You can convert this to a matrix build if you need cross-platform coverage. + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + + - name: Setup Pages + uses: actions/configure-pages@v5 + + - name: Run CUDA bash shell Ubuntu/Debian + env: + temp: ${{ runner.temp }} + cuda: "12.9" + run: | + chmod +x ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh + ${{github.workspace}}/scripts/actions/install-cuda-ubuntu.sh + shell: bash + + - name: Install Doxygen + run: | + sudo apt-get update + sudo apt-get install doxygen + shell: bash + + - name: Configure CMake + # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. + # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type + run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} + + - name: Build + # Build your program with the given configuration + run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} + + - name: Test + working-directory: ${{github.workspace}}/build + # Execute tests defined by the CMake configuration. 
+ # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail + run: ctest -C ${{env.BUILD_TYPE}} + + - name: Upload GitHub Pages artifact + uses: actions/upload-pages-artifact@v3 + with: + # Path of the directory containing the static assets. + path: "build/html/" # default is _site/ + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 + diff --git a/.github/workflows/docker-publish-d.yml b/.github/workflows/docker-publish-d.yml new file mode 100644 index 0000000..2a6ead5 --- /dev/null +++ b/.github/workflows/docker-publish-d.yml @@ -0,0 +1,52 @@ +name: Publish Docker image + +on: + release: + types: [published] + +jobs: + + push_to_registry: + name: Push Docker image + runs-on: ubuntu-22.04 + strategy: + matrix: + image-tag: [ "CUDA-12.9.1-Ubuntu-2004", "CUDA-12.9.1-ubi8" ] + permissions: + packages: write + contents: read + attestations: write + id-token: write + steps: + - name: Check out the repo + uses: actions/checkout@v4 + + - name: Docker Login + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Docker Metadata action + id: meta + uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 + with: + images: ${{ secrets.DOCKER_USERNAME }}/tensor-array + tags: ${{ matrix.image-tag }} + + - name: Build and push Docker images + id: push + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 + with: + context: . 
+ file: ./Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + - name: Attest Build Provenance + uses: actions/attest-build-provenance@v2 + with: + subject-name: index.docker.io/${{ secrets.DOCKER_USERNAME }}/tensor-array + subject-digest: ${{ steps.push.outputs.digest }} + push-to-registry: true diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 0000000..05715a9 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,105 @@ +name: Docker + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +on: + schedule: + - cron: '32 2 * * *' + push: + branches: [ "master" ] + # Publish semver tags as releases. + tags: [ 'v*.*.*' ] + pull_request: + branches: [ "master" ] + +env: + # Use docker.io for Docker Hub if empty + REGISTRY: ghcr.io + # github.repository as / + IMAGE_NAME: ${{ github.repository }} + + +jobs: + build: + strategy: + fail-fast: false + matrix: + image-tag: [ "CUDA-12.9.1-Ubuntu-20.04", "CUDA-12.9.1-ubi8" ] + + runs-on: ubuntu-22.04 + + permissions: + contents: read + packages: write + # This is used to complete the identity challenge + # with sigstore/fulcio when running outside of PRs. 
+ id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Install the cosign tool except on PR + # https://github.com/sigstore/cosign-installer + - name: Install cosign + if: github.event_name != 'pull_request' + uses: sigstore/cosign-installer@59acb6260d9c0ba8f4a2f9d9b48431a222b68e20 #v3.5.0 + with: + cosign-release: 'v2.2.4' + + # Set up BuildKit Docker container builder to be able to build + # multi-platform images and export cache + # https://github.com/docker/setup-buildx-action + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@f95db51fddba0c2d1ec667646a06c2ce06100226 # v3.0.0 + + # Login against a Docker registry except on PR + # https://github.com/docker/login-action + - name: Log into registry ${{ env.REGISTRY }} + if: github.event_name != 'pull_request' + uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Extract metadata (tags, labels) for Docker + # https://github.com/docker/metadata-action + - name: Extract Docker metadata + id: meta + uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934 # v5.0.0 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=raw,value=${{ matrix.image-tag }} + + # Build and push Docker image with Buildx (don't push on PR) + # https://github.com/docker/build-push-action + - name: Build and push Docker image + id: build-and-push + uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09 # v5.0.0 + with: + file: Dockerfolder/${{ matrix.image-tag }}.Dockerfile + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + # Sign the resulting Docker image digest except on PRs. 
+ # This will only write to the public Rekor transparency log when the Docker + # repository is public to avoid leaking data. If you would like to publish + # transparency data even for private images, pass --force to cosign below. + # https://github.com/sigstore/cosign + - name: Sign the published Docker image + if: ${{ github.event_name != 'pull_request' }} + env: + # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable + TAGS: ${{ steps.meta.outputs.tags }} + DIGEST: ${{ steps.build-and-push.outputs.digest }} + # This step uses the identity token to provision an ephemeral certificate + # against the sigstore community Fulcio instance. + run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} diff --git a/.github/workflows/msvc.yml b/.github/workflows/msvc.yml new file mode 100644 index 0000000..d4f9072 --- /dev/null +++ b/.github/workflows/msvc.yml @@ -0,0 +1,74 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. +# +# Find more information at: +# https://github.com/microsoft/msvc-code-analysis-action + +name: Microsoft C++ Code Analysis + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + schedule: + - cron: '37 5 * * 4' + +env: + # Path to the CMake build directory. 
+ build: '${{ github.workspace }}/build' + +permissions: + contents: read + +jobs: + analyze: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: Analyze + runs-on: windows-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Run CUDA bash shell Windows + env: + temp: ${{ runner.temp }} + cuda: "12.4.1" + run: scripts/actions/install-cuda-windows.ps1 + shell: pwsh + + + - name: Configure CMake + run: cmake -B ${{ env.build }} + + # Build is not required unless generated source files are used + # - name: Build CMake + # run: cmake --build ${{ env.build }} + + - name: Initialize MSVC Code Analysis + uses: microsoft/msvc-code-analysis-action@04825f6d9e00f87422d6bf04e1a38b1f3ed60d99 + # Provide a unique ID to access the sarif output path + id: run-analysis + with: + cmakeBuildDirectory: ${{ env.build }} + # Ruleset file that will determine what checks will be run + ruleset: NativeRecommendedRules.ruleset + + # Upload SARIF file to GitHub Code Scanning Alerts + - name: Upload SARIF to GitHub + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: ${{ steps.run-analysis.outputs.sarif }} + + # Upload SARIF file as an Artifact to download and view + # - name: Upload SARIF as an Artifact + # uses: actions/upload-artifact@v4 + # with: + # name: sarif-file + # path: ${{ steps.run-analysis.outputs.sarif }} diff --git a/.gitignore b/.gitignore index bc78b51..f4f2205 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ lib/ bin/ include/ build-temp/ +.vs/ +out/ +CMakeSettings.json diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..4bdb4f5 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,50 @@ +# This file is a template, and might need editing before it works on your 
project. +# This is a sample GitLab CI/CD configuration file that should run without any modifications. +# It demonstrates a basic 3 stage CI/CD pipeline. Instead of real tests or scripts, +# it uses echo commands to simulate the pipeline execution. +# +# A pipeline is composed of independent jobs that run scripts, grouped into stages. +# Stages run in sequential order, but jobs within stages run in parallel. +# +# For more information, see: https://docs.gitlab.com/ee/ci/yaml/#stages +# +# You can copy and paste this template into a new `.gitlab-ci.yml` file. +# You should not add this template to an existing `.gitlab-ci.yml` file by using the `include:` keyword. +# +# To contribute improvements to CI/CD templates, please follow the Development guide at: +# https://docs.gitlab.com/development/cicd/templates/ +# This specific template is located at: +# https://gitlab.com/gitlab-org/gitlab/-/blob/master/lib/gitlab/ci/templates/Getting-Started.gitlab-ci.yml + +stages: # List of stages for jobs, and their order of execution + - build + - test + - deploy + +build-job: # This job runs in the build stage, which runs first. + stage: build + script: + - echo "Compiling the code..." + - echo "Compile complete." + +unit-test-job: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + script: + - echo "Running unit tests... This will take about 60 seconds." + - sleep 60 + - echo "Code coverage is 90%" + +lint-test-job: # This job also runs in the test stage. + stage: test # It can run at the same time as unit-test-job (in parallel). + script: + - echo "Linting code... This will take about 10 seconds." + - sleep 10 + - echo "No lint issues found." + +deploy-job: # This job runs in the deploy stage. + stage: deploy # It only runs when *both* jobs in the test stage complete successfully. + environment: production + script: + - echo "Deploying application..." + - echo "Application successfully deployed." 
+ diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json index d224842..84bab62 100644 --- a/.vscode/c_cpp_properties.json +++ b/.vscode/c_cpp_properties.json @@ -6,7 +6,8 @@ "${workspaceFolder}/src" ], "defines": [], - "compilerPath": "/usr/bin/g++" + "compilerPath": "/usr/bin/g++", + "configurationProvider": "ms-vscode.cmake-tools" }, { "name": "Windows", diff --git a/.vscode/settings.json b/.vscode/settings.json index 821aaac..bbbdf48 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -59,6 +59,13 @@ "stop_token": "cpp", "streambuf": "cpp", "thread": "cpp", - "typeindex": "cpp" + "typeindex": "cpp", + "cassert": "cpp", + "stack": "cpp", + "*.h": "c", + "map": "cpp", + "optional": "cpp", + "fstream": "cpp", + "set": "cpp" } } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 652e3b9..c698055 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,30 +1,38 @@ -cmake_minimum_required(VERSION 3.8.0) +cmake_minimum_required(VERSION 3.18) -project(TensorArray) +project(TensorArray C CXX) include(GNUInstallDirs) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/build-temp/archive) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/build-temp/library) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/build-temp/runtime) -set(CMAKE_OUTPUT ${CMAKE_CURRENT_LIST_DIR}/build-temp/bin) -set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) +# set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) -add_subdirectory("src/tensor_array/core") -add_subdirectory("src/tensor_array/layers") +include(cmake/ta_core_config.cmake) +include(cmake/ta_layers_config.cmake) +include(cmake/ta_interp_config.cmake) + +# add_subdirectory("src/tensor-array/core") +# add_subdirectory("src/tensor-array/layers") +# add_subdirectory("src/tensor-array/interp") + +include(CTest) +if(BUILD_TESTING) + add_subdirectory("tests/tensor-array/core") +endif() + +include(cmake/ta_add_doxygen.cmake) set(CPACK_PACKAGE_NAME 
"TensorArray") set(CPACK_PACKAGE_VENDOR "TensorArray-Creators") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY) -set(CPACK_PACKAGE_VERSION "0.1.0") +set(CPACK_PACKAGE_VERSION "0.2.0") set(CPACK_PACKAGE_VERSION_MAJOR "0") -set(CPACK_PACKAGE_VERSION_MINOR "1") +set(CPACK_PACKAGE_VERSION_MINOR "2") set(CPACK_PACKAGE_VERSION_PATCH "0") set(CPACK_PACKAGE_INSTALL_DIRECTORY "A machine learning libraries") install( EXPORT TensorArrayTargets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/TensorArray + DESTINATION ${CMAKE_INSTALL_LIBDIR}/tensor-array/cmake NAMESPACE TensorArray:: FILE TensorArrayTargets.cmake ) @@ -33,13 +41,13 @@ include(CMakePackageConfigHelpers) configure_package_config_file( "Config.cmake.in" "TensorArrayConfig.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/TensorArray + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/tensor-array/cmake PATH_VARS CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_INCLUDEDIR ) write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/TensorArrayConfigVersion.cmake - VERSION 0.1.0 + VERSION 0.2.0 COMPATIBILITY SameMajorVersion ) @@ -48,7 +56,7 @@ install( FILES ${CMAKE_CURRENT_BINARY_DIR}/TensorArrayConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/TensorArrayConfigVersion.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/TensorArray + DESTINATION ${CMAKE_INSTALL_LIBDIR}/tensor-array/cmake ) include(CPack) diff --git a/Dockerfolder/CUDA-12.9.1-Ubuntu-20.04.Dockerfile b/Dockerfolder/CUDA-12.9.1-Ubuntu-20.04.Dockerfile new file mode 100644 index 0000000..e8226e3 --- /dev/null +++ b/Dockerfolder/CUDA-12.9.1-Ubuntu-20.04.Dockerfile @@ -0,0 +1,35 @@ +FROM nvcr.io/nvidia/cuda:12.9.1-devel-ubuntu20.04 + +RUN apt-get update +RUN apt-get upgrade -y +RUN apt-get install curl -y + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake-ubuntu.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake-ubuntu.sh && 
/tmp/reinstall-cmake-ubuntu.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake-ubuntu.sh + + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends + +WORKDIR /main-project +COPY ./ tensor-array/ + +WORKDIR tensor-array/build + +RUN cmake .. +RUN cmake --build . +RUN cmake --install . +# RUN ctest + +WORKDIR .. diff --git a/Dockerfolder/CUDA-12.9.1-ubi8.Dockerfile b/Dockerfolder/CUDA-12.9.1-ubi8.Dockerfile new file mode 100644 index 0000000..5bbd1ec --- /dev/null +++ b/Dockerfolder/CUDA-12.9.1-ubi8.Dockerfile @@ -0,0 +1,35 @@ +FROM nvcr.io/nvidia/cuda:12.9.1-devel-ubi8 + +RUN dnf update -y +RUN dnf upgrade -y +RUN dnf install curl -y + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.27.9" + +# Optionally install the cmake for vcpkg +COPY scripts/packages-install/reinstall-cmake-rhel.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake-rhel.sh && /tmp/reinstall-cmake-rhel.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake-rhel.sh + + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends + +WORKDIR /main-project +COPY ./ tensor-array/ + +WORKDIR tensor-array/build + +RUN cmake .. +RUN cmake --build . +RUN cmake --install . +# RUN ctest + +WORKDIR .. 
diff --git a/README.md b/README.md index c845edb..0ee20a1 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,39 @@ # Tensor-Array ![C++](https://img.shields.io/badge/C%2B%2B-17-blue) +[![Docker Image Size with architecture (latest by date/latest semver)](https://img.shields.io/docker/image-size/noobwastaken/tensor-array) +](https://hub.docker.com/repository/docker/noobwastaken/tensor-array/general) + A C++ Tensor library that can be used to work with machine learning or deep learning project. Build your own neural network models with this library. +## Installing `Tensor-Array` + +You need to clone repository by using [Git](https://git-scm.com/) +You need to install `Tensor-Array` with [CMake](https://cmake.org/) + +```shell +git clone https://github.com/Tensor-Array/Tensor-Array.git +cd Tensor-Array +mkdir build +cd build +cmake .. +cmake --build . +cmake --install . +cd .. +``` ## Why this repository named `Tensor-Array` We created a template struct that named `TensorArray`. That struct is a multi-dimensional array wrapper. 
```C++ -#include "tensor_array/core/tensorbase.hh" +#include -using tensor_array::value; +using namespace tensor_array::value; int main() { @@ -59,13 +77,14 @@ The `Tensor::get_grad()` method can get the gradient after call `Tensor::calc_gr ```C++ #include -#include "tensor_array/core/tensor.hh" +#include -using tensor_array::value; +using namespace std; +using namespace tensor_array::value; int main() { - tensor_array::value::TensorArray example_tensor_array = + TensorArray example_tensor_array = {{ {{ 1, 2, 3, 4 }}, {{ 5, 6, 7, 8 }}, @@ -76,13 +95,11 @@ int main() Tensor example_tensor_1(example_tensor_array); Tensor example_tensor_2(example_tensor_array_scalar); Tensor example_tensor_sum = example_tensor_1 + example_tensor_2; - std::cout << example_tensor_sum << std::endl; + cout << example_tensor_sum << endl; example_tensor_sum.calc_grad(); - std::cout << example_tensor_1.get_grad() << std::endl; - std::cout << example_tensor_2.get_grad() << std::endl; + cout << example_tensor_1.get_grad() << endl; + cout << example_tensor_2.get_grad() << endl; return 0; } ``` - - diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..1ae7a71 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,11 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +| ------- | ------------------ | +| 2025 | :white_check_mark: | +| 2024 | :x: | +| 2023 | :x: | + +## Reporting a Vulnerability diff --git a/cmake/ta_add_doxygen.cmake b/cmake/ta_add_doxygen.cmake new file mode 100644 index 0000000..94a1bf6 --- /dev/null +++ b/cmake/ta_add_doxygen.cmake @@ -0,0 +1,11 @@ +find_package(Doxygen) + +if(Doxygen_FOUND) + set(DOXYGEN_GENERATE_HTML YES) + set(DOXYGEN_FILE_PATTERNS *.c *.cc *.h *.hh *.cu *.md) + set(DOXYGEN_EXTENSION_MAPPING "*.cu=c++") + set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "${PROJECT_SOURCE_DIR}/README.md") + set(DOXYGEN_EXCLUDE_PATTERNS ".*/*" "build/*" "cmake/*" "scripts/*" "CMakeLists.txt" "*.cmake" "*.cmake.*") + + doxygen_add_docs(tensorarray_docs 
${PROJECT_SOURCE_DIR} ALL) +endif() diff --git a/cmake/ta_core_config.cmake b/cmake/ta_core_config.cmake new file mode 100644 index 0000000..e1a2c2a --- /dev/null +++ b/cmake/ta_core_config.cmake @@ -0,0 +1,94 @@ +set(TensorArray_Core_Dir tensor-array/core) + +file( + GLOB TensorArray_Core_inc + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Core_Dir}/*.h" + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Core_Dir}/*.hh" +) + +install( + FILES ${TensorArray_Core_inc} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${TensorArray_Core_Dir}" + COMPONENT headers +) + +include(CheckLanguage) +check_language(CUDA) + +file(GLOB TensorArray_Core_cc "${PROJECT_SOURCE_DIR}/src/${TensorArray_Core_Dir}/*.cc") + +if (CMAKE_CUDA_COMPILER) + file(GLOB TensorArray_Core_cu "${PROJECT_SOURCE_DIR}/src/${TensorArray_Core_Dir}/*.cu") +endif() + +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + add_library(tensorarray_core_object OBJECT ${TensorArray_Core_cc} ${TensorArray_Core_cu}) + set_property(TARGET tensorarray_core_object PROPERTY CUDA_STANDARD 17) + set_property(TARGET tensorarray_core_object PROPERTY CUDA_STANDARD_REQUIRED ON) + set_property(TARGET tensorarray_core_object PROPERTY CUDA_EXTENSIONS OFF) + set_property(TARGET tensorarray_core_object PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_include_directories(tensorarray_core_object PRIVATE $<$:${CUDAToolkit_INCLUDE_DIRS}>) + if(MSVC) + target_compile_definitions(tensorarray_core_object PRIVATE TENSOR_ARRAY_CORE_EXPORTS) + endif() + + # find_package(CUDAToolkit REQUIRED) + # set(CMAKE_CUDA_ARCHITECTURES 52 75 89) + # set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + # list(APPEND CMAKE_CUDA_FLAGS "--default-stream per-thread") +else() + add_library(tensorarray_core_object OBJECT ${TensorArray_Core_cc}) +endif() + +# file(MAKE_DIRECTORY "include/tensor_array/core") + +set_property(TARGET tensorarray_core_object PROPERTY C_STANDARD 11) +set_property(TARGET tensorarray_core_object PROPERTY C_STANDARD_REQUIRED ON) 
+set_property(TARGET tensorarray_core_object PROPERTY C_EXTENSIONS OFF) + +set_property(TARGET tensorarray_core_object PROPERTY CXX_STANDARD 17) +set_property(TARGET tensorarray_core_object PROPERTY CXX_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_core_object PROPERTY CXX_EXTENSIONS OFF) + +# shared libraries need PIC +set_property(TARGET tensorarray_core_object PROPERTY POSITION_INDEPENDENT_CODE 1) + +# shared and static libraries built from the same object files +add_library(tensorarray_core SHARED $) +add_library(tensorarray_core_static STATIC $) + +if(CUDAToolkit_FOUND) + set_property(TARGET tensorarray_core PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries( + tensorarray_core + PRIVATE $<$:CUDA::cublas> + ) +endif() + +install( + TARGETS tensorarray_core + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + +install( + TARGETS tensorarray_core_static + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + +add_library(TensorArray::core ALIAS tensorarray_core) +add_library(TensorArray::core_static ALIAS tensorarray_core_static) +add_library(TensorArray::core_object ALIAS tensorarray_core_object) diff --git a/cmake/ta_interp_config.cmake b/cmake/ta_interp_config.cmake new file mode 100644 index 0000000..9f43e0d --- /dev/null +++ b/cmake/ta_interp_config.cmake @@ -0,0 +1,50 @@ +set(TensorArray_Interpreter_Dir tensor-array/interp) + +file( + GLOB TensorArray_Interpreter_inc + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Interpreter_Dir}/*.h" + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Interpreter_Dir}/*.hh" +) + +install( + FILES 
${TensorArray_Interpreter_inc} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${TensorArray_Interpreter_Dir}" + COMPONENT headers +) + +file( + GLOB TensorArray_Interpreter_src + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Interpreter_Dir}/*.c" + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Interpreter_Dir}/*.cc" +) +add_executable(tensorarray_interpreter ${TensorArray_Interpreter_src}) + +target_include_directories(tensorarray_interpreter PRIVATE ${PROJECT_SOURCE_DIR}/src) +target_link_libraries(tensorarray_interpreter PUBLIC TensorArray::core) + +set_property(TARGET tensorarray_interpreter PROPERTY C_STANDARD 11) +set_property(TARGET tensorarray_interpreter PROPERTY C_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_interpreter PROPERTY C_EXTENSIONS OFF) + +set_property(TARGET tensorarray_interpreter PROPERTY CXX_STANDARD 17) +set_property(TARGET tensorarray_interpreter PROPERTY CXX_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_interpreter PROPERTY CXX_EXTENSIONS OFF) + +install( + TARGETS tensorarray_interpreter + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + #[[ + add_custom_command( + OUTPUT test.tmp + DEPENDS tensorarray_interpreter + POST_BUILD + COMMAND tensorarray_interpreter) + ]] +add_executable(TensorArray::interpreter ALIAS tensorarray_interpreter) diff --git a/cmake/ta_layers_config.cmake b/cmake/ta_layers_config.cmake new file mode 100644 index 0000000..d0961a9 --- /dev/null +++ b/cmake/ta_layers_config.cmake @@ -0,0 +1,66 @@ +set(TensorArray_Layers_Dir tensor-array/layers) + +file( + GLOB TensorArray_Layers_inc + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Layers_Dir}/*.h" + "${PROJECT_SOURCE_DIR}/src/${TensorArray_Layers_Dir}/*.hh" +) + +install( + FILES ${TensorArray_Layers_inc} + DESTINATION 
"${CMAKE_INSTALL_INCLUDEDIR}/${TensorArray_Layers_Dir}" + COMPONENT headers +) + +file(GLOB TensorArray_Layers_src "${PROJECT_SOURCE_DIR}/src/${TensorArray_Layers_Dir}/*.cc") + +add_library(tensorarray_layers_object OBJECT ${TensorArray_Layers_src}) + +target_include_directories(tensorarray_layers_object PRIVATE ${PROJECT_SOURCE_DIR}/src) + +set_property(TARGET tensorarray_layers_object PROPERTY C_STANDARD 11) +set_property(TARGET tensorarray_layers_object PROPERTY C_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_layers_object PROPERTY C_EXTENSIONS OFF) + +set_property(TARGET tensorarray_layers_object PROPERTY CXX_STANDARD 17) +set_property(TARGET tensorarray_layers_object PROPERTY CXX_STANDARD_REQUIRED ON) +set_property(TARGET tensorarray_layers_object PROPERTY CXX_EXTENSIONS OFF) + +# shared libraries need PIC +set_property(TARGET tensorarray_layers_object PROPERTY POSITION_INDEPENDENT_CODE 1) + +if(MSVC) + target_compile_definitions(tensorarray_layers_object PRIVATE TENSOR_ARRAY_LAYERS_EXPORTS) +endif() + +# shared and static libraries built from the same object files +add_library(tensorarray_layers SHARED $) +add_library(tensorarray_layers_static STATIC $) + +target_link_libraries(tensorarray_layers PUBLIC TensorArray::core) + +install( + TARGETS tensorarray_layers + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + +install( + TARGETS tensorarray_layers_static + EXPORT TensorArrayTargets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + COMPONENT Runtime + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}/tensor-array" + COMPONENT Development +) + +add_library(TensorArray::layers ALIAS tensorarray_layers) +add_library(TensorArray::layers_static ALIAS 
tensorarray_layers_static) +add_library(TensorArray::layers_object ALIAS tensorarray_layers_object) diff --git a/scripts/actions/install-cuda-rhel.sh b/scripts/actions/install-cuda-rhel.sh new file mode 100644 index 0000000..65e8b1b --- /dev/null +++ b/scripts/actions/install-cuda-rhel.sh @@ -0,0 +1,152 @@ +CUDA_PACKAGES_IN=( + "cuda-compiler" + "cuda-cudart-devel" + "cuda-nvtx" + "cuda-nvrtc" + "cuda-cccl" + "libcurand-devel" + "libcublas-devel" + "libcufft-devel" +) + + +function version_ge() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$2" ] +} +# returns 0 (true) if a > b +function version_gt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_ge $1 $2 +} +# returns 0 (true) if a <= b +function version_le() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$1" ] +} +# returns 0 (true) if a < b +function version_lt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_le $1 $2 +} + + +LINUX_ID=$(lsb_release -si) +LINUX_ID="${LINUX_ID,,}" + +LINUX_VERSION=$(lsb_release -sr) +LINUX_VERSION="${LINUX_VERSION//.}" + +LINUX_VERSION_MAJOR_MINOR=$(lsb_release -sr) +LINUX_MAJOR=$(echo "${LINUX_VERSION_MAJOR_MINOR}" | cut -d. -f1) +LINUX_MINOR=$(echo "${LINUX_VERSION_MAJOR_MINOR}" | cut -d. -f2) +LINUX_PATCH=$(echo "${LINUX_VERSION_MAJOR_MINOR}" | cut -d. 
-f3) + +YUM_PACKAGE_MANAGER="yum" +YUM_CONFIG_MANAGER="yum-config-manager" + +if [[ "${LINUX_ID}" == "almalinux" || "${LINUX_ID}" == "centos" || "${LINUX_ID}" == "oracle" ]]; then + echo "LINUX_ID: ${LINUX_ID} change to rhel" + LINUX_ID="rhel" + if [[ "${LINUX_MAJOR}" -ge "8" ]]; then + YUM_PACKAGE_MANAGER="dnf" + YUM_CONFIG_MANAGER="dnf config-manager" + fi + LINUX_VERSION=${LINUX_MAJOR} +fi + +LOCATION_TEMP=${temp} + +CUDA_VERSION_MAJOR_MINOR=${cuda} + +CUDA_MAJOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f1) +CUDA_MINOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f2) +CUDA_PATCH=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f3) + +CPU_ARCH=$(uname -m) +if [[ "${CPU_ARCH}" == "aarch64" ]] +then + CPU_ARCH="sbsa" +fi + +for package in "${CUDA_PACKAGES_IN[@]}" +do : + # @todo This is not perfect. Should probably provide a separate list for diff versions + # cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y + if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" + then + package="cuda-compiler" + elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" + then + package="cuda-nvcc" + # CUB/Thrust are packages in cuda-thrust in 11.3, but cuda-cccl in 11.4+ + elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]] + then + # CUDA cuda-thrust >= 11.4 + if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" + then + package="cuda-cccl" + # Use cuda-thrust > 11.2 + elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" + then + package="cuda-thrust" + # Do not include this package < 11.3 + else + continue + fi + fi + # Build the full package name and append to the string.
+ CUDA_PACKAGES+=" ${package}-${CUDA_MAJOR}-${CUDA_MINOR}" +done +echo "CUDA_PACKAGES ${CUDA_PACKAGES}" + +REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/${LINUX_ID}${LINUX_VERSION}/${CPU_ARCH}/cuda-${LINUX_ID}${LINUX_VERSION}.repo" + +is_root=false +if (( $EUID == 0)) +then + is_root=true +fi +# Find if sudo is available +has_sudo=false +if command -v sudo &> /dev/null +then + has_sudo=true +fi +# Decide if we can proceed or not (root or sudo is required) and if so store whether sudo should be used or not. +if [ "$is_root" = false ] && [ "$has_sudo" = false ] +then + echo "Root or sudo is required. Aborting." + exit 1 +elif [ "$is_root" = false ] +then + USE_SUDO=sudo +else + USE_SUDO= +fi + +echo "Adding CUDA Repository" +$USE_SUDO $YUM_CONFIG_MANAGER --add-repo ${REPO_URL} +$USE_SUDO $YUM_PACKAGE_MANAGER clean all + +$USE_SUDO $YUM_PACKAGE_MANAGER install -y ${CUDA_PACKAGES} + +if [[ $? -ne 0 ]] +then + echo "CUDA Installation Error." + exit 1 +fi + +CUDA_PATH=/usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} +echo "CUDA_PATH=${CUDA_PATH}" +export CUDA_PATH=${CUDA_PATH} +export PATH="$PATH:$CUDA_PATH/bin" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_PATH/lib" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_PATH/lib64" + +if [[ $GITHUB_ACTIONS ]] +then + echo "${CUDA_PATH}/bin" >> $GITHUB_PATH + echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV +fi diff --git a/scripts/actions/install-cuda-ubuntu.sh b/scripts/actions/install-cuda-ubuntu.sh index 45d85dd..34a507f 100644 --- a/scripts/actions/install-cuda-ubuntu.sh +++ b/scripts/actions/install-cuda-ubuntu.sh @@ -3,10 +3,10 @@ CUDA_PACKAGES_IN=( "cuda-cudart" "cuda-nvtx" "cuda-nvrtc" + "cuda-cccl" "libcurand-dev" "libcublas-dev" "libcufft-dev" - "cuda-cccl" ) function version_ge() { @@ -44,22 +44,34 @@ CUDA_MAJOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f1) CUDA_MINOR=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. 
-f2) CUDA_PATCH=$(echo "${CUDA_VERSION_MAJOR_MINOR}" | cut -d. -f3) +CPU_ARCH=$(uname -m) +if [[ "${CPU_ARCH}" == "aarch64" ]] +then + CPU_ARCH="sbsa" +fi + + CUDA_PACKAGES="" for package in "${CUDA_PACKAGES_IN[@]}" do : # @todo This is not perfect. Should probably provide a separate list for diff versions # cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y - if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" + then package="cuda-compiler" - elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" + then package="cuda-nvcc" # CUB/Thrust are packages in cuda-thrust in 11.3, but cuda-cccl in 11.4+ - elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]]; then + elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]] + then # CUDA cuda-thrust >= 11.4 - if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" ; then + if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" + then package="cuda-cccl" # Use cuda-thrust > 11.2 - elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" ; then + elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" + then package="cuda-thrust" # Do not include this pacakge < 11.3 else @@ -71,27 +83,30 @@ do : done echo "CUDA_PACKAGES ${CUDA_PACKAGES}" -CPU_ARCH="x86_64" PIN_FILENAME="cuda-${LINUX_ID}${LINUX_VERSION}.pin" PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/${LINUX_ID}${LINUX_VERSION}/${CPU_ARCH}/${PIN_FILENAME}" -KERYRING_PACKAGE_FILENAME="cuda-keyring_1.0-1_all.deb" +KERYRING_PACKAGE_FILENAME="cuda-keyring_1.1-1_all.deb" KEYRING_PACKAGE_URL="https://developer.download.nvidia.com/compute/cuda/repos/${LINUX_ID}${LINUX_VERSION}/${CPU_ARCH}/${KERYRING_PACKAGE_FILENAME}" 
REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/${LINUX_ID}${LINUX_VERSION}/${CPU_ARCH}/" is_root=false -if (( $EUID == 0)); then +if (( $EUID == 0)) +then is_root=true fi # Find if sudo is available has_sudo=false -if command -v sudo &> /dev/null ; then +if command -v sudo &> /dev/null +then has_sudo=true fi # Decide if we can proceed or not (root or sudo is required) and if so store whether sudo should be used or not. -if [ "$is_root" = false ] && [ "$has_sudo" = false ]; then +if [ "$is_root" = false ] && [ "$has_sudo" = false ] +then echo "Root or sudo is required. Aborting." exit 1 -elif [ "$is_root" = false ] ; then +elif [ "$is_root" = false ] +then USE_SUDO=sudo else USE_SUDO= @@ -106,7 +121,8 @@ $USE_SUDO apt-get update $USE_SUDO apt-get -y install ${CUDA_PACKAGES} -if [[ $? -ne 0 ]]; then +if [[ $? -ne 0 ]] +then echo "CUDA Installation Error." exit 1 fi @@ -114,14 +130,14 @@ fi CUDA_PATH=/usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} echo "CUDA_PATH=${CUDA_PATH}" export CUDA_PATH=${CUDA_PATH} -export PATH="$CUDA_PATH/bin:$PATH" -export LD_LIBRARY_PATH="$CUDA_PATH/lib:$LD_LIBRARY_PATH" -export LD_LIBRARY_PATH="$CUDA_PATH/lib64:$LD_LIBRARY_PATH" +export PATH="$PATH:$CUDA_PATH/bin" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_PATH/lib" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_PATH/lib64" -if [[ $GITHUB_ACTIONS ]]; then - # Set paths for subsequent steps, using ${CUDA_PATH} +if [[ $GITHUB_ACTIONS ]] +then echo "Adding CUDA to CUDA_PATH, PATH and LD_LIBRARY_PATH" echo "${CUDA_PATH}/bin" >> $GITHUB_PATH - echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV fi diff --git a/scripts/actions/install-cuda-windows.ps1 b/scripts/actions/install-cuda-windows.ps1 new file mode 100644 index 0000000..c47930b --- /dev/null +++ 
b/scripts/actions/install-cuda-windows.ps1 @@ -0,0 +1,56 @@ +$CUDA_PACKAGES_IN = @( + "nvcc" + "visual_studio_integration" + "cudart" + "nvtx" + "nvrtc" + "thrust" + "curand_dev" + "cublas_dev" + "cufft_dev" +) + +function Version-Ge($a, $b) { + return ([version]$a -ge [version]$b) +} +function Version-Gt($a, $b) { + return ([version]$a -gt [version]$b) +} +function Version-Le($a, $b) { + return ([version]$a -le [version]$b) +} +function Version-Lt($a, $b) { + return ([version]$a -lt [version]$b) +} + +# Expect $env:cuda to be set, e.g. "12.4.1" +$CUDA_VERSION_MAJOR_MINOR = $env:cuda + +$parts = $CUDA_VERSION_MAJOR_MINOR.Split('.') +$CUDA_MAJOR = $parts[0] +$CUDA_MINOR = $parts[1] +$CUDA_PATCH = if ($parts.Count -gt 2) { $parts[2] } else { "0" } + +$CUDA_PACKAGES = "" +foreach ($package in $CUDA_PACKAGES_IN) { + $CUDA_PACKAGES += " ${package}_${CUDA_MAJOR}.${CUDA_MINOR}" +} +Write-Host "CUDA_PACKAGES $CUDA_PACKAGES" + +$cudaInstallerUrl = "https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION_MAJOR_MINOR/network_installers/cuda_${CUDA_VERSION_MAJOR_MINOR}_windows_network.exe" +Invoke-WebRequest -Uri $cudaInstallerUrl -OutFile "cuda_installer.exe" +Start-Process -FilePath ".\cuda_installer.exe" -ArgumentList "-s $CUDA_PACKAGES" -Wait +Remove-Item "cuda_installer.exe" -Force + +$CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$CUDA_MAJOR.$CUDA_MINOR" +Write-Host "CUDA_PATH=$CUDA_PATH" +$env:CUDA_PATH = $CUDA_PATH + +# If executing on github actions, emit the appropriate echo statements to update environment variables +if (Test-Path "env:GITHUB_ACTIONS") { + # Set paths for subsequent steps, using $env:CUDA_PATH + Write-Host "Adding CUDA to CUDA_PATH, and PATH" + Add-Content -Path $env:GITHUB_ENV -Value "CUDA_PATH=$env:CUDA_PATH" + Add-Content -Path $env:GITHUB_ENV -Value "CUDA_PATH_V${CUDA_MAJOR}_${CUDA_MINOR}=$env:CUDA_PATH" + Add-Content -Path $env:GITHUB_PATH -Value "$env:CUDA_PATH\bin" +} diff --git 
a/scripts/actions/install-rocm-ubuntu.sh b/scripts/actions/install-rocm-ubuntu.sh new file mode 100644 index 0000000..7039c8d --- /dev/null +++ b/scripts/actions/install-rocm-ubuntu.sh @@ -0,0 +1,122 @@ +ROCM_PACKAGES_IN=( + rocm-hip-runtime-dev +) + +function version_ge() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$2" ] +} +# returns 0 (true) if a > b +function version_gt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_ge $1 $2 +} +# returns 0 (true) if a <= b +function version_le() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$1" ] +} +# returns 0 (true) if a < b +function version_lt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_le $1 $2 +} + + +LINUX_ID=$(lsb_release -si) +LINUX_ID="${LINUX_ID,,}" + +LINUX_VERSION=$(lsb_release -sr) +LINUX_VERSION="${LINUX_VERSION//.}" + +LINUX_CODENAME=$(lsb_release -cs) +LINUX_CODENAME="${LINUX_CODENAME,,}" + +LOCATION_TEMP=${temp} + +ROCM_VERSION_MAJOR_MINOR=${rocm} + +CPU_ARCH=$(uname -m) +if [[ "${CPU_ARCH}" == "aarch64" ]] +then + CPU_ARCH="sbsa" +fi + + +ROCM_PACKAGES="" +for package in "${ROCM_PACKAGES_IN[@]}" +do : + # Build the full package name and append to the string. + ROCM_PACKAGES+=" ${package}" +done +echo "ROCM_PACKAGES ${ROCM_PACKAGES}" + +GPG_FILENAME="rocm.gpg.key" +GPG_URL="https://repo.radeon.com/rocm/${GPG_FILENAME}" +REPO_URL="https://repo.radeon.com/rocm/apt/${rocm}/" + +is_root=false +if (( $EUID == 0)) +then + is_root=true +fi +# Find if sudo is available +has_sudo=false +if command -v sudo &> /dev/null +then + has_sudo=true +fi +# Decide if we can proceed or not (root or sudo is required) and if so store whether sudo should be used or not. 
+if [ "$is_root" = false ] && [ "$has_sudo" = false ] +then + echo "Root or sudo is required. Aborting." + exit 1 +elif [ "$is_root" = false ] +then + USE_SUDO=sudo +else + USE_SUDO= +fi + +KEYRINGS_DIR=/etc/apt/keyrings + +if [ ! -e $KEYRINGS_DIR ] +then + echo "Create directory: ${KEYRINGS_DIR}" + $USE_SUDO mkdir --parents --mode=0755 ${KEYRINGS_DIR} +fi + +ROCM_GPG_KEYRING=${KEYRINGS_DIR}/rocm.gpg + +echo "Adding ROCm Repository:" +wget ${GPG_URL} -O - | \ + gpg --dearmor | $USE_SUDO tee ${ROCM_GPG_KEYRING} > /dev/null +echo "deb [arch=amd64 signed-by=${ROCM_GPG_KEYRING}] ${REPO_URL} ${LINUX_CODENAME} main" \ + | $USE_SUDO tee /etc/apt/sources.list.d/rocm.list +echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \ + | $USE_SUDO tee /etc/apt/preferences.d/rocm-pin-600 +echo "Adding ROCm Repository completed." +$USE_SUDO apt-get update + +$USE_SUDO apt-get -y install ${ROCM_PACKAGES} + +if [[ $? -ne 0 ]] +then + echo "ROCm Installation Error." + exit 1 +fi + +ROCM_PATH=/opt/rocm-${rocm} +echo "ROCM_PATH=${ROCM_PATH}" +export ROCM_PATH=${ROCM_PATH} +export PATH="$PATH:$ROCM_PATH/bin" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$ROCM_PATH/lib" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$ROCM_PATH/lib64" + +if [[ $GITHUB_ACTIONS ]] +then + echo "Adding ROCM to ROCM_PATH, PATH and LD_LIBRARY_PATH" + echo "${ROCM_PATH}/bin" >> $GITHUB_PATH + echo "ROCM_PATH=${ROCM_PATH}" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV +fi diff --git a/scripts/packages-install/reinstall-cmake-rhel.sh b/scripts/packages-install/reinstall-cmake-rhel.sh new file mode 100644 index 0000000..0b7ee15 --- /dev/null +++ b/scripts/packages-install/reinstall-cmake-rhel.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +#------------------------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. +#------------------------------------------------------------------------------------------------------------- +# +set -e + +CMAKE_VERSION=${1:-"none"} + +if [ "${CMAKE_VERSION}" = "none" ]; then + echo "No CMake version specified, skipping CMake reinstallation" + exit 0 +fi + +# Cleanup temporary directory and associated files when exiting the script. +cleanup() { + EXIT_CODE=$? + set +e + if [[ -n "${TMP_DIR}" ]]; then + echo "Executing cleanup of tmp files" + rm -Rf "${TMP_DIR}" + fi + exit $EXIT_CODE +} +trap cleanup EXIT + + +echo "Installing CMake..." +dnf -y remove cmake +dnf -y autoremove +mkdir -p /opt/cmake + +architecture=$(arch) +ARCH=${architecture} + +CMAKE_BINARY_NAME="cmake-${CMAKE_VERSION}-linux-${ARCH}.sh" +CMAKE_CHECKSUM_NAME="cmake-${CMAKE_VERSION}-SHA-256.txt" +TMP_DIR=$(mktemp -d -t cmake-XXXXXXXXXX) + +echo "${TMP_DIR}" +cd "${TMP_DIR}" + +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_BINARY_NAME}" -O +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_CHECKSUM_NAME}" -O + +sha256sum -c --ignore-missing "${CMAKE_CHECKSUM_NAME}" +sh "${TMP_DIR}/${CMAKE_BINARY_NAME}" --prefix=/opt/cmake --skip-license + +ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake +ln -s /opt/cmake/bin/ctest /usr/local/bin/ctest diff --git a/scripts/packages-install/reinstall-cmake-ubuntu.sh b/scripts/packages-install/reinstall-cmake-ubuntu.sh new file mode 100644 index 0000000..408b81d --- /dev/null +++ b/scripts/packages-install/reinstall-cmake-ubuntu.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +#------------------------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 
+#------------------------------------------------------------------------------------------------------------- +# +set -e + +CMAKE_VERSION=${1:-"none"} + +if [ "${CMAKE_VERSION}" = "none" ]; then + echo "No CMake version specified, skipping CMake reinstallation" + exit 0 +fi + +# Cleanup temporary directory and associated files when exiting the script. +cleanup() { + EXIT_CODE=$? + set +e + if [[ -n "${TMP_DIR}" ]]; then + echo "Executing cleanup of tmp files" + rm -Rf "${TMP_DIR}" + fi + exit $EXIT_CODE +} +trap cleanup EXIT + + +echo "Installing CMake..." +apt-get -y purge --auto-remove cmake +mkdir -p /opt/cmake + +architecture=$(dpkg --print-architecture) +case "${architecture}" in + arm64) + ARCH=aarch64 ;; + amd64) + ARCH=x86_64 ;; + *) + echo "Unsupported architecture ${architecture}." + exit 1 + ;; +esac + +CMAKE_BINARY_NAME="cmake-${CMAKE_VERSION}-linux-${ARCH}.sh" +CMAKE_CHECKSUM_NAME="cmake-${CMAKE_VERSION}-SHA-256.txt" +TMP_DIR=$(mktemp -d -t cmake-XXXXXXXXXX) + +echo "${TMP_DIR}" +cd "${TMP_DIR}" + +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_BINARY_NAME}" -O +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_CHECKSUM_NAME}" -O + +sha256sum -c --ignore-missing "${CMAKE_CHECKSUM_NAME}" +sh "${TMP_DIR}/${CMAKE_BINARY_NAME}" --prefix=/opt/cmake --skip-license + +ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake +ln -s /opt/cmake/bin/ctest /usr/local/bin/ctest diff --git a/src/tensor_array/core/data_type_wrapper.cu b/src/tensor-array/core/data_type_wrapper.cc similarity index 86% rename from src/tensor_array/core/data_type_wrapper.cu rename to src/tensor-array/core/data_type_wrapper.cc index 95d7cbe..478f0c6 100644 --- a/src/tensor_array/core/data_type_wrapper.cu +++ b/src/tensor-array/core/data_type_wrapper.cc @@ -32,12 +32,12 @@ limitations under the License. 
typedef __nv_bfloat16 bfloat16; -#define USING_DATA_TYPE_NVIDIA_FLOAT_8 (__nv_fp8_e5m2)(__nv_fp8_e4m3) -#define USING_DATA_TYPE_NVIDIA_FLOAT (half)(bfloat16) -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT USING_DATA_TYPE_NVIDIA_FLOAT +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (half)(bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() USING_DATA_TYPE_NVIDIA_FLOAT() namespace tensor_array { diff --git a/src/tensor_array/core/data_type_wrapper.hh b/src/tensor-array/core/data_type_wrapper.hh similarity index 100% rename from src/tensor_array/core/data_type_wrapper.hh rename to src/tensor-array/core/data_type_wrapper.hh diff --git a/src/tensor_array/core/devices.cu b/src/tensor-array/core/devices.cc similarity index 99% rename from src/tensor_array/core/devices.cu rename to src/tensor-array/core/devices.cc index c3a1694..fce05f8 100644 --- a/src/tensor_array/core/devices.cu +++ b/src/tensor-array/core/devices.cc @@ -14,11 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "devices.hh" +#include +#include #include #include #include -#include namespace tensor_array { diff --git a/src/tensor_array/core/devices.hh b/src/tensor-array/core/devices.hh similarity index 82% rename from src/tensor_array/core/devices.hh rename to src/tensor-array/core/devices.hh index ca49598..e3aeb41 100644 --- a/src/tensor_array/core/devices.hh +++ b/src/tensor-array/core/devices.hh @@ -16,14 +16,14 @@ limitations under the License. #pragma once -#ifdef __WIN32__ -#ifdef CUDA_ML_EXPORTS -#define CUDA_ML_API __declspec(dllexport) +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) #else -#define CUDA_ML_API __declspec(dllimport) +#define TENSOR_ARRAY_API __declspec(dllimport) #endif #else -#define CUDA_ML_API +#define TENSOR_ARRAY_API #endif namespace tensor_array @@ -33,7 +33,7 @@ namespace tensor_array enum DeviceType { CPU, - CUDA + CUDA, }; struct Device @@ -44,7 +44,7 @@ namespace tensor_array constexpr Device DEVICE_CPU_0{ CPU,0 }; - CUDA_ML_API Device& local_device(); + TENSOR_ARRAY_API Device& local_device(); void device_memcpy(void*, Device, const void*, Device, size_t); @@ -54,7 +54,7 @@ namespace tensor_array void device_memset(void*, Device, int, size_t, void*); - CUDA_ML_API void device_CUDA_get_info(); + TENSOR_ARRAY_API void device_CUDA_get_info(); } } @@ -66,4 +66,4 @@ void operator delete(void*, tensor_array::devices::Device); void operator delete(void*, tensor_array::devices::Device, void*); -#undef CUDA_ML_API \ No newline at end of file +#undef TENSOR_ARRAY_API \ No newline at end of file diff --git a/src/tensor_array/core/extern_type_map.cc b/src/tensor-array/core/extern_type_map.cc similarity index 100% rename from src/tensor_array/core/extern_type_map.cc rename to src/tensor-array/core/extern_type_map.cc diff --git a/src/tensor-array/core/extern_type_map.hh b/src/tensor-array/core/extern_type_map.hh new file mode 100644 index 0000000..9e33356 --- /dev/null +++ 
b/src/tensor-array/core/extern_type_map.hh @@ -0,0 +1,41 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include + +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_CORE_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + +namespace tensor_array +{ + namespace value + { + /** + * Map of data types. + */ + extern TENSOR_ARRAY_API std::unordered_map dynamic_type_size; + } +} + +#undef TENSOR_ARRAY_API diff --git a/src/tensor-array/core/initializer_wrapper.hh b/src/tensor-array/core/initializer_wrapper.hh new file mode 100644 index 0000000..8245f0b --- /dev/null +++ b/src/tensor-array/core/initializer_wrapper.hh @@ -0,0 +1,96 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include + +namespace tensor_array +{ + namespace wrapper + { +#ifdef _MSC_VER + template + class initializer_wrapper: public std::initializer_list<_E> + { + public: + typedef _E value_type; + typedef const _E& reference; + typedef const _E& const_reference; + typedef size_t size_type; + typedef const _E* iterator; + typedef const _E* const_iterator; + public: + constexpr initializer_wrapper(const_iterator __a, size_type __l) + : std::initializer_list<_E>(__a, __a + __l) { } + + constexpr initializer_wrapper(const_iterator __begin, const_iterator __end) + : std::initializer_list<_E>(__begin, __end) { } + + constexpr initializer_wrapper() noexcept: std::initializer_list<_E>() { } + }; +#else + template + class initializer_wrapper + { + public: + typedef _E value_type; + typedef const _E& reference; + typedef const _E& const_reference; + typedef size_t size_type; + typedef const _E* iterator; + typedef const _E* const_iterator; + + private: +#ifdef __GNUC__ + iterator _M_array; + size_type _M_len; +#endif + + public: + constexpr initializer_wrapper(const_iterator __a, size_type __l) +#ifdef __GNUC__ + : _M_array(__a), _M_len(__l) { } +#endif + + constexpr initializer_wrapper(const_iterator __begin, const_iterator __end) + : _M_array(__begin), _M_len(__end - __begin) + { } + + constexpr initializer_wrapper() noexcept: _M_array(0), _M_len(0) { } + + // Number of elements. + constexpr size_type + size() const noexcept + { + return _M_len; + } + + // First element. + constexpr const_iterator + begin() const noexcept { + return _M_array; + } + + // One past the last element. 
+ constexpr const_iterator + end() const noexcept { + return begin() + size(); + } + + constexpr operator std::initializer_list<_E>() const { return reinterpret_cast&>(*this); } + }; +#endif // !_MSC_VER + } +} \ No newline at end of file diff --git a/src/tensor_array/core/tensor.cc b/src/tensor-array/core/tensor.cc similarity index 97% rename from src/tensor_array/core/tensor.cc rename to src/tensor-array/core/tensor.cc index 9b85cac..7d10a75 100644 --- a/src/tensor_array/core/tensor.cc +++ b/src/tensor-array/core/tensor.cc @@ -14,23 +14,23 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include -#include -#include "data_type_wrapper.hh" #ifndef TENSOR_CONTENT #define TENSOR_CONTENT #include "tensor.hh" #undef TENSOR_CONTENT #endif // !TENSOR_CONTENT + +#include +#include #include +#include +#include +#include "data_type_wrapper.hh" -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) @@ -47,6 +47,16 @@ namespace tensor_array { bool use_grad = true; + bool is_use_grad() + { + return use_grad; + } + + void set_use_grad(bool use) + { + use_grad = use; + } + class Tensor::TensorContent { private: @@ -140,7 +150,7 @@ namespace tensor_array void Tensor::TensorContent::reset_grad() { - std::lock_guard tensor_lock(this->tensor_mutex); + std::lock_guard tensor_lock(this->tensor_mutex); this->grad = 
zeros(this->buf.shape()).tensor_cast(this->buf.type()).get_buffer(); } @@ -159,7 +169,7 @@ namespace tensor_array this->TensorContent::calc_grad(grad); if (this->can_calc_grad && this->forward_back.empty()) { - std::lock_guard tensor_lock(this->tensor_mutex); + std::lock_guard tensor_lock(this->tensor_mutex); std::forward_list thread_list; for (auto& dat : this->derive_data) if (this->derive_multithread) @@ -174,7 +184,7 @@ namespace tensor_array void Tensor::TensorContent::calc_grad(const Tensor& grad) { - std::lock_guard tensor_lock(this->tensor_mutex); + std::lock_guard tensor_lock(this->tensor_mutex); this->grad = add(this->grad, grad, false).get_buffer(); } @@ -468,8 +478,10 @@ temp_check_data_type = TEMP(temp.first) < TEMP(temp_tensor); return this->tensor_data.use_count(); } + std::mutex calc_grad_mutex; void Tensor::calc_grad() { + std::lock_guard calc_grad_lock(calc_grad_mutex); this->tensor_data->reset_grad(); this->tensor_data->calc_grad(values(this->get_buffer().shape(), 1.f).tensor_cast(this->get_buffer().type())); } @@ -732,26 +744,6 @@ temp_check_data_type = TEMP(temp.first) < TEMP(temp_tensor); return divide(a, b); } - Tensor operator!=(const Tensor& a, const Tensor& b) - { - return ab; - } - - Tensor operator==(const Tensor& a, const Tensor& b) - { - return !(a != b); - } - - Tensor operator>=(const Tensor& a, const Tensor& b) - { - return !(a < b); - } - - Tensor operator<=(const Tensor& a, const Tensor& b) - { - return !(a > b); - } - Tensor Tensor::exp() const { return this->exp(true); @@ -902,6 +894,7 @@ out_stream << static_cast(tensor_out); std::pair broadcast_t = tensor_broadcasting(a, temp_b, 0, 2); return batchedmatmul(broadcast_t.first, broadcast_t.second, true, nullptr); } + throw std::exception(); } Tensor condition(const Tensor& value_bool, const Tensor& value_true, const Tensor& value_false) diff --git a/src/tensor_array/core/tensor.hh b/src/tensor-array/core/tensor.hh similarity index 77% rename from src/tensor_array/core/tensor.hh 
rename to src/tensor-array/core/tensor.hh index 48b2670..af276d7 100644 --- a/src/tensor_array/core/tensor.hh +++ b/src/tensor-array/core/tensor.hh @@ -16,24 +16,15 @@ limitations under the License. #include #include +#include #include "tensorbase.hh" #pragma once -#ifdef __WIN32__ -#ifdef CUDA_ML_EXPORTS -#define CUDA_ML_API __declspec(dllexport) -#else -#define CUDA_ML_API __declspec(dllimport) -#endif -#else -#define CUDA_ML_API -#endif - -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) @@ -44,11 +35,21 @@ limitations under the License. #define END(...) END_(__VA_ARGS__) #define END_(...) __VA_ARGS__##_END +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_CORE_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace value { - extern CUDA_ML_API bool use_grad; + extern TENSOR_ARRAY_API bool use_grad; #ifdef TENSOR_CONTENT void* create_mem_101(std::size_t s, const void* dat); @@ -99,11 +100,17 @@ namespace tensor_array dilation; }; + class Tensor; + + TENSOR_ARRAY_API Tensor tensor_rand(const std::initializer_list&, unsigned int = std::rand()); + + TENSOR_ARRAY_API std::pair tensor_broadcasting(const Tensor&, const Tensor&, unsigned char = 0, unsigned char = 0); + /** * \brief Dynamic derivative tensor. * \brief This class use to calculate the tensor. 
*/ - class CUDA_ML_API Tensor + class TENSOR_ARRAY_API Tensor { public: /** @@ -130,7 +137,7 @@ namespace tensor_array /** * \brief This class can iterate copy child tensor by index and derivate to parent tensor, */ - class CUDA_ML_API Iterator + class TENSOR_ARRAY_API Iterator { public: using iterator_category = std::forward_iterator_tag; @@ -144,8 +151,8 @@ namespace tensor_array Iterator& operator--(); Iterator operator++(int); Iterator operator--(int); - friend bool CUDA_ML_API operator==(const Iterator&, const Iterator&); - friend bool CUDA_ML_API operator!=(const Iterator&, const Iterator&); + friend bool TENSOR_ARRAY_API operator==(const Iterator&, const Iterator&); + friend bool TENSOR_ARRAY_API operator!=(const Iterator&, const Iterator&); private: unsigned long long index; reference_left ref; @@ -183,17 +190,10 @@ namespace tensor_array Tensor reshape(const std::vector&) const; Tensor tensor_cast(const std::type_info&) const; Tensor conv_padding(const dimension&) const; -#ifdef TENSOR_CONTENT - friend Tensor derive_transpose(const Tensor&, const Tensor&, bool, const DataBuffer&); - - friend Tensor derive_reshape_cast(const Tensor&, const Tensor&, bool, const DataBuffer&); -#endif Tensor transpose(unsigned char, unsigned char) const; std::pair max(unsigned char = 0) const; std::pair min(unsigned char = 0) const; - friend std::pair tensor_broadcasting(const Tensor&, const Tensor&, unsigned char, unsigned char); #ifdef TENSOR_CONTENT - friend CUDA_ML_API Tensor add_dim(const std::vector&); #endif bool has_tensor() const; template @@ -225,10 +225,10 @@ namespace tensor_array Tensor& operator/=(const Tensor&); - friend CUDA_ML_API Tensor operator>(const Tensor&, const Tensor&); - friend CUDA_ML_API Tensor operator<(const Tensor&, const Tensor&); - friend CUDA_ML_API Tensor operator&&(const Tensor&, const Tensor&); - friend CUDA_ML_API Tensor operator||(const Tensor&, const Tensor&); + friend TENSOR_ARRAY_API Tensor operator>(const Tensor&, const Tensor&); + 
friend TENSOR_ARRAY_API Tensor operator<(const Tensor&, const Tensor&); + friend TENSOR_ARRAY_API Tensor operator&&(const Tensor&, const Tensor&); + friend TENSOR_ARRAY_API Tensor operator||(const Tensor&, const Tensor&); Tensor operator!(); Tensor exp() const; Tensor sin() const; @@ -238,11 +238,16 @@ namespace tensor_array Tensor cosh() const; Tensor tanh() const; Tensor sigmoid() const; + Tensor reduce_sum(unsigned char) const; + Tensor reduce_max(unsigned char) const; + Tensor reduce_min(unsigned char) const; Tensor log() const; #ifdef TENSOR_CONTENT - friend Tensor tensor_rand(const std::initializer_list&, unsigned int); + friend Tensor derive_transpose(const Tensor&, const Tensor&, bool, const DataBuffer&); + friend Tensor derive_reshape_cast(const Tensor&, const Tensor&, bool, const DataBuffer&); + friend Tensor add(const Tensor&, const Tensor&, bool); friend Tensor power(const Tensor&, const Tensor&, bool); @@ -264,7 +269,14 @@ namespace tensor_array Tensor tensor_cast(const std::type_info&, bool) const; #endif - friend CUDA_ML_API std::ostream& operator<<(std::ostream&, const Tensor&); + + friend TENSOR_ARRAY_API Tensor add_dim(const std::vector&); + + friend Tensor tensor_rand(const std::initializer_list&, unsigned int/* = std::rand() */); + + friend std::pair tensor_broadcasting(const Tensor&, const Tensor&, unsigned char /*= 0*/, unsigned char /*= 0*/); + + friend TENSOR_ARRAY_API std::ostream& operator<<(std::ostream&, const Tensor&); private: #ifdef TENSOR_CONTENT @@ -294,7 +306,7 @@ namespace tensor_array std::shared_ptr tensor_data; }; - class CUDA_ML_API WeakTensor + class TENSOR_ARRAY_API WeakTensor { public: WeakTensor(const Tensor&); @@ -304,13 +316,13 @@ namespace tensor_array std::weak_ptr tensor_data; }; - CUDA_ML_API dimension operator+(const dimension&, const dimension&); + TENSOR_ARRAY_API dimension operator+(const dimension&, const dimension&); - CUDA_ML_API dimension operator-(const dimension&, const dimension&); + TENSOR_ARRAY_API 
dimension operator-(const dimension&, const dimension&); - CUDA_ML_API dimension operator*(const dimension&, const dimension&); + TENSOR_ARRAY_API dimension operator*(const dimension&, const dimension&); - CUDA_ML_API dimension operator/(const dimension&, const dimension&); + TENSOR_ARRAY_API dimension operator/(const dimension&, const dimension&); /** * \brief Plus 2 n-d tensors. @@ -318,9 +330,9 @@ namespace tensor_array * \return * Tensor */ - CUDA_ML_API Tensor operator+(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator+(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator-(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator-(const Tensor&, const Tensor&); /** * \brief Multiply 2 n-d tensors. @@ -328,19 +340,19 @@ namespace tensor_array * \return * Tensor */ - CUDA_ML_API Tensor operator*(const Tensor&, const Tensor&); - - CUDA_ML_API Tensor operator/(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator!=(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator==(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator>=(const Tensor&, const Tensor&); - CUDA_ML_API Tensor operator<=(const Tensor&, const Tensor&); - CUDA_ML_API Tensor tensor_file_load(const char*); - CUDA_ML_API Tensor power(const Tensor&, const Tensor&); - CUDA_ML_API Tensor add(const Tensor&, const Tensor&); - CUDA_ML_API Tensor multiply(const Tensor&, const Tensor&); - CUDA_ML_API Tensor divide(const Tensor&, const Tensor&); - CUDA_ML_API Tensor dot(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator*(const Tensor&, const Tensor&); + + TENSOR_ARRAY_API Tensor operator/(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator!=(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator==(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator>=(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor operator<=(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor tensor_file_load(const char*); + 
TENSOR_ARRAY_API Tensor power(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor add(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor multiply(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor divide(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor dot(const Tensor&, const Tensor&); /** * \brief Matrix multiplication 2 matrices. * \param a Matrix/Tensor that has size (batch*)m*k. @@ -348,8 +360,8 @@ namespace tensor_array * \return Tensor - Matrix that has size (batch*)m*n. * \exception a.col != b.row */ - CUDA_ML_API Tensor matmul(const Tensor&, const Tensor&); - CUDA_ML_API Tensor condition(const Tensor&, const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor matmul(const Tensor&, const Tensor&); + TENSOR_ARRAY_API Tensor condition(const Tensor&, const Tensor&, const Tensor&); /** * \brief Convolution * \brief Only suport 1D, 2D, 3D convolution @@ -360,17 +372,12 @@ namespace tensor_array * \return * Tensor (N, K, ...) */ - CUDA_ML_API Tensor convolution(const Tensor&, const Tensor&, const dimension& = value::dimension(), const dimension& = value::dimension()); - CUDA_ML_API std::pair tensor_broadcasting(const Tensor&, const Tensor&, unsigned char = 0, unsigned char = 0); - CUDA_ML_API Tensor tensor_rand(const std::initializer_list&, unsigned int = std::rand()); -#define ADD_CODE(TYPE) CUDA_ML_API Tensor values(const std::initializer_list&, TYPE); + TENSOR_ARRAY_API Tensor convolution(const Tensor&, const Tensor&, const dimension& = value::dimension(), const dimension& = value::dimension()); +#define ADD_CODE(TYPE) TENSOR_ARRAY_API Tensor values(const std::initializer_list&, TYPE); LOOP(USING_DATA_TYPE); #undef ADD_CODE -#ifndef TENSOR_CONTENT - CUDA_ML_API Tensor add_dim(const std::vector&); -#endif - CUDA_ML_API const std::type_info& comparison_type(const std::type_info&, const std::type_info&); - CUDA_ML_API Tensor tensor_rand(const std::vector&, unsigned int = std::rand()); + TENSOR_ARRAY_API const std::type_info& 
comparison_type(const std::type_info&, const std::type_info&); + TENSOR_ARRAY_API Tensor tensor_rand(const std::vector&, unsigned int = std::rand()); #ifdef TENSOR_CONTENT class Derivation @@ -448,4 +455,4 @@ struct std::equal_to #undef USING_DATA_TYPE_SINT #undef USING_DATA_TYPE_UINT -#undef CUDA_ML_API \ No newline at end of file +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/core/tensor_blas.cu b/src/tensor-array/core/tensor_blas.cc similarity index 98% rename from src/tensor_array/core/tensor_blas.cu rename to src/tensor-array/core/tensor_blas.cc index a260ccb..1bb16e4 100644 --- a/src/tensor_array/core/tensor_blas.cu +++ b/src/tensor-array/core/tensor_blas.cc @@ -207,7 +207,10 @@ namespace tensor_array c_ptr, convert_cuda_type(c_type), shape_a.end()[-2], 1, batch_size, convert_cuda_type(c_type), CUBLAS_GEMM_DEFAULT); blasStat = cublasDestroy(blasHandle); - TensorBase value_buf(c_type, { batch_size, shape_a.end()[-2] , shape_b.end()[-1] }, c_ptr, this_cuda); + std::vector out_dims = shape_a; + out_dims[out_dims.size() - 1] = shape_b.end()[-1]; + + TensorBase value_buf(c_type, out_dims, c_ptr, this_cuda); cudaStat = cudaFree(c_ptr); return Tensor(std::move(value_buf), std::move(temp)); } diff --git a/src/tensor_array/core/tensor_cast.cu b/src/tensor-array/core/tensor_cast.cu similarity index 76% rename from src/tensor_array/core/tensor_cast.cu rename to src/tensor-array/core/tensor_cast.cu index 3c3da08..7749258 100644 --- a/src/tensor_array/core/tensor_cast.cu +++ b/src/tensor-array/core/tensor_cast.cu @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #ifndef TENSOR_CONTENT #define TENSOR_CONTENT #include "tensor.hh" @@ -34,22 +35,39 @@ limitations under the License. #define END(...) END_(__VA_ARGS__) #define END_(...) 
__VA_ARGS__##_END -#define USING_DATA_TYPE_NVIDIA_FLOAT_8 (__nv_fp8_e5m2)(__nv_fp8_e4m3) -#define USING_DATA_TYPE_NVIDIA_FLOAT (__half)(__nv_bfloat16) -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e4m3)(__nv_fp8_e5m2) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (__half)(__nv_bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE_CAST_TO \ -(bool) \ -USING_DATA_TYPE_SINT \ -USING_DATA_TYPE_UINT \ -USING_DATA_TYPE_FLOAT \ -USING_DATA_TYPE_NVIDIA_FLOAT +#if CUDART_VERSION >= 12020 +#define USING_DATA_TYPE_CAST_FROM() \ +USING_DATA_TYPE_SINT() \ +USING_DATA_TYPE_UINT() \ +USING_DATA_TYPE_FLOAT() \ +USING_DATA_TYPE_NVIDIA_FLOAT() +#else +#define USING_DATA_TYPE_CAST_FROM() \ +USING_DATA_TYPE_SINT() \ +USING_DATA_TYPE_UINT() \ +USING_DATA_TYPE_FLOAT() +#endif -#define USING_DATA_TYPE_CAST_FROM \ -USING_DATA_TYPE_CAST_TO \ -USING_DATA_TYPE_NVIDIA_FLOAT_8 +#if CUDART_VERSION >= 12020 +#define USING_DATA_TYPE_CAST_TO() \ +(bool) \ +(int8_t) \ +(uint8_t) \ +USING_DATA_TYPE_NVIDIA_FLOAT_8() \ +USING_DATA_TYPE_CAST_FROM() +#else +#define USING_DATA_TYPE_CAST_TO() \ +(bool) \ +(int8_t) \ +(uint8_t) \ +USING_DATA_TYPE_CAST_FROM() +#endif namespace tensor_array { @@ -85,13 +103,13 @@ namespace tensor_array #define ADD_CODE(TYPE) \ if(this->get_buffer().type() == typeid(TYPE)) \ type_casting<<>>(out_ptr, static_cast(base_of_this.data()), total_size); - LOOP(USING_DATA_TYPE_CAST_TO); + LOOP(USING_DATA_TYPE_CAST_FROM()); #undef ADD_CODE cuda_status = cudaDeviceSynchronize(); cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", 
cudaGetErrorString(cuda_status)); } std::type_index test = typeid(T); if (dynamic_type_size.find(test) == dynamic_type_size.end()) @@ -108,7 +126,7 @@ type_casting<<>>(out_ptr, static_cast(base_of_ #define ADD_CODE(TYPE) \ if(dtype == typeid(TYPE)) \ return this->cast(is_derive); - LOOP(USING_DATA_TYPE_CAST_FROM); + LOOP(USING_DATA_TYPE_CAST_TO()); #undef ADD_CODE throw std::exception(); } diff --git a/src/tensor_array/core/tensor_convolution.cc b/src/tensor-array/core/tensor_convolution.cc similarity index 100% rename from src/tensor_array/core/tensor_convolution.cc rename to src/tensor-array/core/tensor_convolution.cc diff --git a/src/tensor_array/core/tensor_convolution.cu b/src/tensor-array/core/tensor_convolution.cu similarity index 95% rename from src/tensor_array/core/tensor_convolution.cu rename to src/tensor-array/core/tensor_convolution.cu index ae2489f..6b3217d 100644 --- a/src/tensor_array/core/tensor_convolution.cu +++ b/src/tensor-array/core/tensor_convolution.cu @@ -17,6 +17,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -27,29 +28,29 @@ limitations under the License. 
#endif #if __CUDA_ARCH__ >= 800 -#define USE_BF16 (__nv_bfloat16) +#define USE_BF16() (__nv_bfloat16) #else -#define USE_BF16 +#define USE_BF16() #endif #if __CUDA_ARCH__ >= 700 -#define USE_FP16 (__half) +#define USE_FP16() (__half) #else -#define USE_FP16 +#define USE_FP16() #endif #if __CUDA_ARCH__ >= 600 -#define USE_FP64 (double) +#define USE_FP64() (double) #else -#define USE_FP64 +#define USE_FP64() #endif -#define USING_DATA_TYPE_NVIDIA_FLOAT_8 (__nv_fp8_e5m2)(__nv_fp8_e4m3) -#define USING_DATA_TYPE_NVIDIA_FLOAT USE_FP16 USE_BF16 -#define USING_DATA_TYPE_FLOAT (float)USE_FP64 -#define USING_DATA_TYPE_SINT (int32_t) -#define USING_DATA_TYPE_UINT (uint32_t)(unsigned long long int) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT USING_DATA_TYPE_NVIDIA_FLOAT +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() USE_FP16() USE_BF16() +#define USING_DATA_TYPE_FLOAT() (float) USE_FP64() +#define USING_DATA_TYPE_SINT() (int32_t) +#define USING_DATA_TYPE_UINT() (uint32_t)(unsigned long long int) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() USING_DATA_TYPE_NVIDIA_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) @@ -178,7 +179,7 @@ kernel_derive_conv_padding<<>>(static_cast(out_ptr), cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); } TensorBase value_buf(a.get_buffer().type(), new_shape, out_ptr, this_cuda); cuda_status = cudaFree(out_ptr); @@ -262,7 +263,7 @@ kernel_conv_padding<<>>(static_cast(out_ptr), static cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); } TensorBase value_buf(a.get_buffer().type(), 
new_shape, out_ptr, this_cuda); cuda_status = cudaFree(out_ptr); @@ -390,7 +391,7 @@ kernel_col2im<<>>(static_cast(out_ptr), static_cast< cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); } TensorBase value_buf(a.get_buffer().type(), new_shape, out_ptr, this_cuda); cuda_status = cudaFree(out_ptr); @@ -514,7 +515,7 @@ kernel_im2col<<>>(static_cast(out_ptr), static_cast< cuda_status = cudaGetLastError(); if (cuda_status != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); } TensorBase value_buf(a.get_buffer().type(), new_shape, out_ptr, this_cuda); cuda_status = cudaFree(out_ptr); diff --git a/src/tensor_array/core/tensor.cu b/src/tensor-array/core/tensor_math_func.cu similarity index 62% rename from src/tensor_array/core/tensor.cu rename to src/tensor-array/core/tensor_math_func.cu index 9db96a8..5ccef02 100644 --- a/src/tensor_array/core/tensor.cu +++ b/src/tensor-array/core/tensor_math_func.cu @@ -28,15 +28,16 @@ limitations under the License. 
#ifndef TENSOR_CONTENT #define TENSOR_CONTENT #include "tensor.hh" +#include "tensor_math_op.hh" #undef TENSOR_CONTENT #endif // !TENSOR_CONTENT -#define USING_DATA_TYPE_NVIDIA_FLOAT_8 (__nv_fp8_e5m2)(__nv_fp8_e4m3) -#define USING_DATA_TYPE_NVIDIA_FLOAT (__half)(__nv_bfloat16) -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT USING_DATA_TYPE_NVIDIA_FLOAT +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (__half)(__nv_bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() USING_DATA_TYPE_NVIDIA_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) @@ -70,30 +71,6 @@ namespace tensor_array value_arr[thread_x] = curand_uniform(&thisState); } - template - __global__ void sum_2_arr(T c[], const T a[], const T b[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - c[thread_x] = a[thread_x] + b[thread_x]; - } - - template - __global__ void mul_2_arr(T c[], const T a[], const T b[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - c[thread_x] = a[thread_x] * b[thread_x]; - } - - template - __global__ void div_2_arr(T c[], const T a[], const T b[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - c[thread_x] = a[thread_x] / b[thread_x]; - } - __global__ void exp_arr(float value_out[], const float value_in[], unsigned int c_size) { unsigned int 
thread_x = blockIdx.x * blockDim.x + threadIdx.x; @@ -206,14 +183,6 @@ namespace tensor_array value_out[thread_x] = tanh(value_in[thread_x]); } - template - __global__ void sigmoid_arr(T value_out[], const T value_in[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - value_out[thread_x] = T(1) / (T(1) + T(exp(double(-value_in[thread_x])))); - } - template __global__ void pow_arr(T value_out[], const T a[], const T b[], unsigned int c_size) { @@ -225,45 +194,6 @@ namespace tensor_array value_out[thread_x] = powf(a[thread_x], b[thread_x]); } - template - __global__ void arr_more_than(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = in1_value[thread_x] > in2_value[thread_x]; - } - - template - __global__ void arr_less_than(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = in1_value[thread_x] < in2_value[thread_x]; - } - - __global__ void arr_logical_and(bool out_value[], const bool in1_value[], const bool in2_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = in1_value[thread_x] && in2_value[thread_x]; - } - - - __global__ void arr_logical_or(bool out_value[], const bool in1_value[], const bool in2_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = in1_value[thread_x] || in2_value[thread_x]; - } - - - __global__ void arr_logical_not(bool out_value[], const bool in1_value[], unsigned int c_size) - { - unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_x < c_size) - out_value[thread_x] = 
!in1_value[thread_x]; - } - template __global__ void array_condition(T out_value[], unsigned int c_size, const bool bool_value[], const T true_value[], const T false_value[]) { @@ -323,7 +253,7 @@ namespace tensor_array cudaStat = cudaGetLastError(); if (cudaStat != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cudaStat)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cudaStat)); } TensorBase value_buf(a.get_buffer().type(), { shape_a.begin()[0], shape_a.begin()[2], shape_a.begin()[1], shape_a.end()[-1]}, c_ptr, this_cuda); cudaStat = cudaFree(c_ptr); @@ -381,197 +311,13 @@ return values0(list_dim, value); \ cudaStatus = cudaGetLastError(); if (cudaStatus != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus)); + std::printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus)); } TensorBase other_buf(typeid(float), list_dim, dev_ptr, this_cuda); cudaStatus = cudaFree(dev_ptr); return other_buf; } - Tensor operator>(const Tensor& a, const Tensor& b) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - cudaError cuda_status; - bool* c_ptr; - Device this_cuda{ CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - base_a.data_size() / get_sizeof_type(base_a.type()), - base_b.data_size() / get_sizeof_type(base_b.type()) - ); - cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -arr_more_than<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaDeviceSynchronize(); - cuda_status = cudaGetLastError(); - if (cuda_status != cudaSuccess) - { - printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); - } - TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); - cuda_status = cudaFree(c_ptr); - return other_buf; - } - - Tensor operator<(const Tensor& a, const Tensor& b) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - cudaError cuda_status; - bool* c_ptr; - Device this_cuda{CUDA}; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -arr_less_than<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); - cuda_status = cudaFree(c_ptr); - return other_buf; - } - - Tensor operator&&(const Tensor& a, const Tensor& b) - { - assert( - equal_dim_size(a.get_buffer(), b.get_buffer()) - && a.get_buffer().type() == typeid(bool) - && b.get_buffer().type() == typeid(bool) - ); - cudaError cuda_status; - bool* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, c_size); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); - arr_logical_and << > > (c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); - cuda_status = cudaFree(c_ptr); - return other_buf; - } - - Tensor operator||(const Tensor& a, const Tensor& b) - { - assert( - equal_dim_size(a.get_buffer(), b.get_buffer()) - && a.get_buffer().type() == typeid(bool) - && b.get_buffer().type() == typeid(bool) - ); - cudaError cuda_status; - bool* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, c_size); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); - arr_logical_or << > > (c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); - cuda_status = cudaFree(c_ptr); - return other_buf; - } - - Tensor Tensor::operator!() - { - assert(this->get_buffer().type() == typeid(bool)); - cudaError cuda_status; - bool* out_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_of_this = this->get_buffer().change_device(this_cuda); - cuda_status = cudaMalloc(&out_ptr, this->get_buffer().data_size()); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(this->get_buffer().data_size() / block_dim.x + 1U); - arr_logical_not << < grid_dim, block_dim >> > (out_ptr, static_cast(base_of_this.data()), this->get_buffer().data_size()); - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(typeid(bool), this->get_buffer().shape(), out_ptr, this_cuda); - cuda_status = cudaFree(out_ptr); - return other_buf; - } - - Tensor multiply(const Tensor& a, const Tensor& b, bool is_derive, const DataBuffer&) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - std::vector> temp; - if (is_derive) - { - temp.push_back(std::make_pair(a, Derivation(b.clone(), multiply))); - temp.push_back(std::make_pair(b, Derivation(a.clone(), multiply))); - } - cudaError cuda_status; - TensorBase other_buf; - void* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / 
get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -{ \ -mul_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ -cuda_status = cudaDeviceSynchronize(); \ -other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ -} - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaFree(c_ptr); - return Tensor(std::move(other_buf), std::move(temp)); - } - Tensor condition(const Tensor& bool_value, const Tensor& true_value, const Tensor& false_value, bool is_derive) { assert( @@ -615,84 +361,6 @@ other_buf = TensorBase(typeid(TYPE), bool_value.get_buffer().shape(), ptr_out, t return Tensor(std::move(other_buf), std::move(temp)); } - Tensor add(const Tensor& a, const Tensor& b, bool is_derive) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - std::vector> temp; - if (is_derive) - { - temp.push_back(std::make_pair(a, Derivation(values(a.get_buffer().shape(), 1).tensor_cast(a.get_buffer().type(), false), multiply))); - temp.push_back(std::make_pair(b, Derivation(values(b.get_buffer().shape(), 1).tensor_cast(b.get_buffer().type(), false), multiply))); - } - cudaError cuda_status; - TensorBase other_buf; - void* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / 
get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -{ \ -sum_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ -cuda_status = cudaDeviceSynchronize(); \ -other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ -} - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaFree(c_ptr); - return Tensor(std::move(other_buf), std::move(temp)); - } - - Tensor divide(const Tensor& a, const Tensor& b, bool is_derive) - { - assert(equal_dim_size(a.get_buffer(), b.get_buffer())); - std::vector> temp; - if (is_derive) - { - temp.push_back(std::make_pair(a, Derivation(divide(values(b.get_buffer().shape(), 1).tensor_cast(b.get_buffer().type(), false), b, false), multiply))); - temp.push_back(std::make_pair(b, Derivation(divide(a, power(b, values(b.get_buffer().shape(), 2).tensor_cast(b.get_buffer().type(), false), false), false), multiply))); - } - cudaError cuda_status; - TensorBase other_buf; - void* c_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_a = a.get_buffer().change_device(this_cuda); - TensorBase base_b = b.get_buffer().change_device(this_cuda); - std::size_t c_size = std::max - ( - a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), - b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) - ); - cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); - dim3 
block_dim(cu_dev_prop.maxThreadsDim[0]); - dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); -#define ADD_CODE(TYPE) \ -if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ -{ \ -div_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ -cuda_status = cudaDeviceSynchronize(); \ -other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ -} - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaFree(c_ptr); - return Tensor(std::move(other_buf), std::move(temp)); - } - Tensor power(const Tensor& a, const Tensor& b, bool is_derive) { assert(equal_dim_size(a.get_buffer(), b.get_buffer())); @@ -956,39 +624,6 @@ tanh_arr<<>>(static_cast(out_ptr), static_cast> temp; - if (is_derive) - { - Tensor temp_ones = values(this->get_buffer().shape(), 1.f).tensor_cast(this->get_buffer().type(), false); - Tensor temp_sigmoid = this->sigmoid(false); - temp.push_back(std::make_pair(*this, Derivation(multiply(temp_sigmoid, add(temp_ones, -temp_sigmoid, false), false, DataBuffer()), multiply))); - } - cudaError cuda_status; - void* out_ptr; - devices::Device this_cuda{ devices::CUDA }; - cuda_status = cudaGetDevice(&this_cuda.index); - cudaDeviceProp cu_dev_prop; - cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); - TensorBase base_of_this = this->get_buffer().change_device(this_cuda); - cuda_status = cudaMalloc(&out_ptr, this->get_buffer().data_size()); - dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); - std::size_t out_size = this->get_buffer().data_size() / get_sizeof_type(this->get_buffer().type()); - dim3 grid_dim(out_size / block_dim.x + ((out_size % block_dim.x) ? 
1U : 0U)); -#define ADD_CODE(TYPE) \ -if(this->get_buffer().type() == typeid(TYPE)) \ -sigmoid_arr<<>>(static_cast(out_ptr), static_cast(base_of_this.data()), out_size); - LOOP(USING_DATA_TYPE); -#undef ADD_CODE - cuda_status = cudaDeviceSynchronize(); - TensorBase other_buf(this->get_buffer().type(), this->get_buffer().shape(), out_ptr, this_cuda); - cuda_status = cudaFree(out_ptr); - return Tensor(std::move(other_buf), std::move(temp)); - } - - } } diff --git a/src/tensor-array/core/tensor_math_op.cu b/src/tensor-array/core/tensor_math_op.cu new file mode 100644 index 0000000..2514095 --- /dev/null +++ b/src/tensor-array/core/tensor_math_op.cu @@ -0,0 +1,602 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef TENSOR_CONTENT +#define TENSOR_CONTENT +#include "tensor.hh" +#include "tensor_math_op.hh" +#undef TENSOR_CONTENT +#endif // !TENSOR_CONTENT + +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (__half)(__nv_bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#if CUDART_VERSION >= 12020 +#define USING_DATA_TYPE \ +USING_DATA_TYPE_SINT() \ +USING_DATA_TYPE_UINT() \ +USING_DATA_TYPE_FLOAT() \ +USING_DATA_TYPE_NVIDIA_FLOAT() +#else +#define USING_DATA_TYPE \ +USING_DATA_TYPE_SINT() \ +USING_DATA_TYPE_UINT() \ +USING_DATA_TYPE_FLOAT() +#endif + +#define LOOP(seq) END(A seq) +#define BODY(x) ADD_CODE(x) +#define A(x) BODY(x) B +#define B(x) BODY(x) A +#define A_END +#define B_END +#define END(...) END_(__VA_ARGS__) +#define END_(...) 
__VA_ARGS__##_END + +namespace tensor_array +{ + namespace value + { + using namespace devices; + + template + __global__ void sum_2_arr(T c[], const T a[], const T b[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + c[thread_x] = a[thread_x] + b[thread_x]; + } + + template + __global__ void mul_2_arr(T c[], const T a[], const T b[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + c[thread_x] = a[thread_x] * b[thread_x]; + } + + template + __global__ void div_2_arr(T c[], const T a[], const T b[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + c[thread_x] = a[thread_x] / b[thread_x]; + } + + template + __global__ void arr_greater_than(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] > in2_value[thread_x]; + } + + template + __global__ void arr_greater_equal(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] >= in2_value[thread_x]; + } + + template + __global__ void arr_less_than(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] < in2_value[thread_x]; + } + + template + __global__ void arr_less_equal(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] <= in2_value[thread_x]; + } + + template + 
__global__ void arr_equal_equal(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] == in2_value[thread_x]; + } + + template + __global__ void arr_not_equal(bool out_value[], const T in1_value[], const T in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] != in2_value[thread_x]; + } + + __global__ void arr_logical_and(bool out_value[], const bool in1_value[], const bool in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] && in2_value[thread_x]; + } + + + __global__ void arr_logical_or(bool out_value[], const bool in1_value[], const bool in2_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = in1_value[thread_x] || in2_value[thread_x]; + } + + + __global__ void arr_logical_not(bool out_value[], const bool in1_value[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + out_value[thread_x] = !in1_value[thread_x]; + } + + template + __global__ void sigmoid_arr(T value_out[], const T value_in[], unsigned int c_size) + { + unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_x < c_size) + value_out[thread_x] = T(1) / (T(1) + T(exp(double(-value_in[thread_x])))); + } + + bool equal_dim_size(const TensorBase& a, const TensorBase& b); + + Tensor operator>(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{ CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; 
+ cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + base_a.data_size() / get_sizeof_type(base_a.type()), + base_b.data_size() / get_sizeof_type(base_b.type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_greater_than<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + cuda_status = cudaGetLastError(); + if (cuda_status != cudaSuccess) + { + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + } + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator>=(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{ CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + base_a.data_size() / get_sizeof_type(base_a.type()), + base_b.data_size() / get_sizeof_type(base_b.type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_greater_equal<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + cuda_status = cudaGetLastError(); + if (cuda_status != cudaSuccess) + { + std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status)); + } + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator<(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{CUDA}; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_less_than<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator<=(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{CUDA}; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_less_equal<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator==(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{CUDA}; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_equal_equal<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator!=(const Tensor& a, const Tensor& b) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + cudaError cuda_status; + bool* c_ptr; + Device this_cuda{CUDA}; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size * sizeof(bool)); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +arr_not_equal<<>>(c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator&&(const Tensor& a, const Tensor& b) + { + assert( + equal_dim_size(a.get_buffer(), b.get_buffer()) + && a.get_buffer().type() == typeid(bool) + && b.get_buffer().type() == typeid(bool) + ); + cudaError cuda_status; + bool* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); + arr_logical_and << > > (c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor operator||(const Tensor& a, const Tensor& b) + { + assert( + equal_dim_size(a.get_buffer(), b.get_buffer()) + && a.get_buffer().type() == typeid(bool) + && b.get_buffer().type() == typeid(bool) + ); + cudaError cuda_status; + bool* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, c_size); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); + arr_logical_or << > > (c_ptr, static_cast(base_a.data()), static_cast(base_b.data()), c_size); + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), a.get_buffer().shape(), c_ptr, this_cuda); + cuda_status = cudaFree(c_ptr); + return other_buf; + } + + Tensor Tensor::operator!() + { + assert(this->get_buffer().type() == typeid(bool)); + cudaError cuda_status; + bool* out_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_of_this = this->get_buffer().change_device(this_cuda); + cuda_status = cudaMalloc(&out_ptr, this->get_buffer().data_size()); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(this->get_buffer().data_size() / block_dim.x + 1U); + arr_logical_not << < grid_dim, block_dim >> > (out_ptr, static_cast(base_of_this.data()), this->get_buffer().data_size()); + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(typeid(bool), this->get_buffer().shape(), out_ptr, this_cuda); + cuda_status = cudaFree(out_ptr); + return other_buf; + } + + Tensor multiply(const Tensor& a, const Tensor& b, bool is_derive, const DataBuffer&) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + std::vector> temp; + if (is_derive) + { + temp.push_back(std::make_pair(a, Derivation(b.clone(), multiply))); + temp.push_back(std::make_pair(b, Derivation(a.clone(), multiply))); + } + cudaError cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / 
get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +{ \ +mul_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ +cuda_status = cudaDeviceSynchronize(); \ +other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf), std::move(temp)); + } + + Tensor add(const Tensor& a, const Tensor& b, bool is_derive) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + std::vector> temp; + if (is_derive) + { + temp.push_back(std::make_pair(a, Derivation(values(a.get_buffer().shape(), 1).tensor_cast(a.get_buffer().type(), false), multiply))); + temp.push_back(std::make_pair(b, Derivation(values(b.get_buffer().shape(), 1).tensor_cast(b.get_buffer().type(), false), multiply))); + } + cudaError cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 
grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +{ \ +sum_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ +cuda_status = cudaDeviceSynchronize(); \ +other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf), std::move(temp)); + } + + Tensor divide(const Tensor& a, const Tensor& b, bool is_derive) + { + assert(equal_dim_size(a.get_buffer(), b.get_buffer())); + std::vector> temp; + if (is_derive) + { + temp.push_back(std::make_pair(a, Derivation(divide(values(b.get_buffer().shape(), 1).tensor_cast(b.get_buffer().type(), false), b, false), multiply))); + temp.push_back(std::make_pair(b, Derivation(divide(a, power(b, values(b.get_buffer().shape(), 2).tensor_cast(b.get_buffer().type(), false), false), false), multiply))); + } + cudaError cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_a = a.get_buffer().change_device(this_cuda); + TensorBase base_b = b.get_buffer().change_device(this_cuda); + std::size_t c_size = std::max + ( + a.get_buffer().data_size() / get_sizeof_type(a.get_buffer().type()), + b.get_buffer().data_size() / get_sizeof_type(b.get_buffer().type()) + ); + cuda_status = cudaMalloc(&c_ptr, std::max(a.get_buffer().data_size(), b.get_buffer().data_size())); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + dim3 grid_dim(c_size / block_dim.x + (c_size % block_dim.x ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(a.get_buffer().type() == typeid(TYPE) && b.get_buffer().type() == typeid(TYPE)) \ +{ \ +div_2_arr<<>>(static_cast(c_ptr), static_cast(base_a.data()), static_cast(base_b.data()), c_size); \ +cuda_status = cudaDeviceSynchronize(); \ +other_buf = TensorBase(typeid(TYPE), a.get_buffer().shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf), std::move(temp)); + } + + Tensor Tensor::sigmoid(bool is_derive) const + { + std::vector> temp; + if (is_derive) + { + Tensor temp_ones = values(this->get_buffer().shape(), 1.f).tensor_cast(this->get_buffer().type(), false); + Tensor temp_sigmoid = this->sigmoid(false); + temp.push_back(std::make_pair(*this, Derivation(multiply(temp_sigmoid, add(temp_ones, -temp_sigmoid, false), false, DataBuffer()), multiply))); + } + cudaError cuda_status; + void* out_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + TensorBase base_of_this = this->get_buffer().change_device(this_cuda); + cuda_status = cudaMalloc(&out_ptr, this->get_buffer().data_size()); + dim3 block_dim(cu_dev_prop.maxThreadsDim[0]); + std::size_t out_size = this->get_buffer().data_size() / get_sizeof_type(this->get_buffer().type()); + dim3 grid_dim(out_size / block_dim.x + ((out_size % block_dim.x) ? 
1U : 0U)); +#define ADD_CODE(TYPE) \ +if(this->get_buffer().type() == typeid(TYPE)) \ +sigmoid_arr<<>>(static_cast(out_ptr), static_cast(base_of_this.data()), out_size); + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaDeviceSynchronize(); + TensorBase other_buf(this->get_buffer().type(), this->get_buffer().shape(), out_ptr, this_cuda); + cuda_status = cudaFree(out_ptr); + return Tensor(std::move(other_buf), std::move(temp)); + } + } +} + +#undef LOOP +#undef BODY +#undef A +#undef B +#undef A_END +#undef B_END +#undef END +#undef END_ + +#undef USING_DATA_TYPE diff --git a/src/tensor-array/core/tensor_math_op.hh b/src/tensor-array/core/tensor_math_op.hh new file mode 100644 index 0000000..1d3c33e --- /dev/null +++ b/src/tensor-array/core/tensor_math_op.hh @@ -0,0 +1,27 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef TENSOR_CONTENT +namespace tensor_array +{ + namespace value + { + Tensor multiply(const Tensor&, const Tensor&, bool, const DataBuffer&); + Tensor add(const Tensor&, const Tensor&, bool); + Tensor divide(const Tensor&, const Tensor&, bool); + } +} +#endif diff --git a/src/tensor-array/core/tensor_reduce.cu b/src/tensor-array/core/tensor_reduce.cu new file mode 100644 index 0000000..5b6ddfd --- /dev/null +++ b/src/tensor-array/core/tensor_reduce.cu @@ -0,0 +1,368 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef TENSOR_CONTENT +#define TENSOR_CONTENT +#include "tensor.hh" +#undef TENSOR_CONTENT +#endif + +#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e5m2)(__nv_fp8_e4m3) +#define USING_DATA_TYPE_NVIDIA_FLOAT() (__half)(__nv_bfloat16) +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() + +#define LOOP(seq) END(A seq) +#define BODY(x) ADD_CODE(x) +#define A(x) BODY(x) B +#define B(x) BODY(x) A +#define A_END +#define B_END +#define END(...) END_(__VA_ARGS__) +#define END_(...) 
__VA_ARGS__##_END


namespace tensor_array
{
    namespace value
    {
        using namespace devices;

        // One step of a shared-memory tree reduction: fold the partial value held
        // at depth (threadIdx.z + value) into this thread's slot at threadIdx.z.
        template <typename T, unsigned int BatchBlockSize, unsigned int DimBlockSize, unsigned int ContentBlockSize>
        __device__ void warp_reduce_sum(T (*sdata)[BatchBlockSize][DimBlockSize][ContentBlockSize], unsigned int value)
        {
            (*sdata)[threadIdx.x][threadIdx.z][threadIdx.y] += (*sdata)[threadIdx.x][threadIdx.z + value][threadIdx.y];
        }

        // Drives a halving tree reduction over the z (reduced) axis of the shared
        // buffer, invoking `func` with strides 512, 256, ..., 1 as permitted by
        // DimBlockSize.
        // NOTE(review): no __syncthreads() between steps — this is only safe while
        // the active z-threads stay within one warp (current launches use
        // DimBlockSize == 8); confirm before raising the z block size.
        template <typename T, unsigned int BatchBlockSize, unsigned int DimBlockSize, unsigned int ContentBlockSize, typename ... Args>
        __device__ void warp_reduce_functions
        (
            // FIX: the callback type must carry Args... — warp_reduce_max/min are
            // passed here with an extra index-buffer argument.
            void(*func)(T (*)[BatchBlockSize][DimBlockSize][ContentBlockSize], unsigned int, Args...),
            T (*sdata)[BatchBlockSize][DimBlockSize][ContentBlockSize],
            Args ... args
        )
        {
            if constexpr (DimBlockSize >= 1024) if (threadIdx.z < 512) func(sdata, 512, args...);
            if constexpr (DimBlockSize >= 512) if (threadIdx.z < 256) func(sdata, 256, args...);
            if constexpr (DimBlockSize >= 256) if (threadIdx.z < 128) func(sdata, 128, args...);
            if constexpr (DimBlockSize >= 128) if (threadIdx.z < 64) func(sdata, 64, args...);
            if constexpr (DimBlockSize >= 64) if (threadIdx.z < 32) func(sdata, 32, args...);
            if constexpr (DimBlockSize >= 32) if (threadIdx.z < 16) func(sdata, 16, args...);
            if constexpr (DimBlockSize >= 16) if (threadIdx.z < 8) func(sdata, 8, args...);
            if constexpr (DimBlockSize >= 8) if (threadIdx.z < 4) func(sdata, 4, args...);
            if constexpr (DimBlockSize >= 4) if (threadIdx.z < 2) func(sdata, 2, args...);
            if constexpr (DimBlockSize >= 2) if (threadIdx.z < 1) func(sdata, 1, args...);
        }

        // Keep the larger value — and the index it came from — in slot threadIdx.z.
        template <typename T, unsigned int BatchBlockSize, unsigned int DimBlockSize, unsigned int ContentBlockSize>
        __device__ void warp_reduce_max(T (*sdata)[BatchBlockSize][DimBlockSize][ContentBlockSize], unsigned int value, unsigned int (*sindex)[BatchBlockSize][DimBlockSize][ContentBlockSize])
        {
            // FIX: the original compared/assigned through the un-dereferenced
            // pointers (indexing whole 3-D arrays, i.e. pointer comparison) and
            // self-assigned slot (z + value); copy the winner from (z + value)
            // into slot z, mirroring warp_reduce_min.
            if ((*sdata)[threadIdx.x][threadIdx.z][threadIdx.y] < (*sdata)[threadIdx.x][threadIdx.z + value][threadIdx.y])
            {
                (*sdata)[threadIdx.x][threadIdx.z][threadIdx.y] = (*sdata)[threadIdx.x][threadIdx.z + value][threadIdx.y];
                (*sindex)[threadIdx.x][threadIdx.z][threadIdx.y] = (*sindex)[threadIdx.x][threadIdx.z + value][threadIdx.y];
            }
        }

        template <typename T, unsigned int BatchBlockSize, unsigned int DimBlockSize, unsigned int ContentBlockSize>
        __device__ void
warp_reduce_min(T (*sdata)[BatchBlockSize][DimBlockSize][ContentBlockSize], unsigned int value, unsigned int (*sindex)[BatchBlockSize][DimBlockSize][ContentBlockSize]) + { + if (sdata[threadIdx.x][threadIdx.z][threadIdx.y] > sdata[threadIdx.x][threadIdx.z + value][threadIdx.y]) + { + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = sdata[threadIdx.x][threadIdx.z + value][threadIdx.y]; + sindex[threadIdx.x][threadIdx.z][threadIdx.y] = sindex[threadIdx.x][threadIdx.z + value][threadIdx.y]; + } + } + + template + __global__ void array_reduce_sum(T *g_odata, const T *g_idata, unsigned int batch_size, unsigned int n, unsigned int content_size) + { + __shared__ T sdata[BatchBlockSize][BlockSize][ContentBlockSize]; + unsigned int batch_id = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int content_id = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int tid = threadIdx.z; + unsigned int gridSize = blockDim.z * gridDim.z; + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = 0; + if (batch_id < batch_size && blockIdx.z * blockDim.z + tid < n && content_id < content_size) + sdata[threadIdx.x][threadIdx.z][threadIdx.y] += + g_idata + [ + batch_id * n * content_size + + tid * content_size + + content_id + ]; + __syncthreads(); + if (tid < 512) + warp_reduce_functions(&warp_reduce_sum, &sdata); + if (tid == 0) + g_odata[ + batch_id * blockDim.z * content_size + + tid * content_size + + content_id + ] = sdata[threadIdx.x][threadIdx.z][threadIdx.y]; + } + + template + __global__ void array_reduce_max(T *g_odata, unsigned int *g_oindex, const T *g_idata, unsigned int batch_size, unsigned int n, unsigned int content_size) + { + __shared__ T sdata[BatchBlockSize][BlockSize][ContentBlockSize]; + __shared__ unsigned int sindex[BatchBlockSize][BlockSize][ContentBlockSize]; + unsigned int batch_id = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int content_id = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int tid = threadIdx.z; + unsigned int gridSize = blockDim.z * gridDim.z; 
+ sdata[threadIdx.x][threadIdx.z][threadIdx.y] = -std::numeric_limits::infinity(); + sindex[threadIdx.x][threadIdx.z][threadIdx.y] = threadIdx.z; + if (batch_id < batch_size && blockIdx.z * blockDim.z + tid < n && content_id < content_size) + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = + g_idata + [ + batch_id * n * content_size + + tid * content_size + + content_id + ]; + __syncthreads(); + if (tid < 512) + warp_reduce_functions(&warp_reduce_max, &sdata, &sindex); + if (tid == 0) + { + g_odata[ + batch_id * blockDim.z * content_size + + blockIdx.z * content_size + + content_id + ] = sdata[threadIdx.x][threadIdx.z][threadIdx.y]; + g_oindex[ + batch_id * blockDim.z * content_size + + blockIdx.z * content_size + + content_id + ] = sindex[threadIdx.x][threadIdx.z][threadIdx.y]; + } + } + + template + __global__ void array_reduce_min(T *g_odata, unsigned int *g_oindex, const T *g_idata, unsigned int batch_size, unsigned int n, unsigned int content_size) + { + __shared__ T sdata[BatchBlockSize][BlockSize][ContentBlockSize]; + __shared__ unsigned int sindex[BatchBlockSize][BlockSize][ContentBlockSize]; + unsigned int batch_id = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int content_id = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int tid = threadIdx.z; + unsigned int gridSize = blockDim.z * gridDim.z; + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = std::numeric_limits::infinity(); + sindex[threadIdx.x][threadIdx.z][threadIdx.y] = threadIdx.z; + if (batch_id < batch_size && blockIdx.z * blockDim.z + tid < n && content_id < content_size) + sdata[threadIdx.x][threadIdx.z][threadIdx.y] = + g_idata + [ + batch_id * n * content_size + + tid * content_size + + content_id + ]; + __syncthreads(); + if (tid < 512) + warp_reduce_functions(&warp_reduce_min, &sdata, &sindex); + if (tid == 0) + { + g_odata[ + batch_id * blockDim.z * content_size + + blockIdx.z * content_size + + content_id + ] = sdata[threadIdx.x][threadIdx.z][threadIdx.y]; + g_oindex[ + batch_id * 
blockDim.z * content_size + + blockIdx.z * content_size + + content_id + ] = sindex[threadIdx.x][threadIdx.z][threadIdx.y]; + } + } + + bool equal_dim_size(const TensorBase&, const TensorBase&); + Tensor derive_reduce_sum(const Tensor& a, const Tensor& b, bool is_derive, const DataBuffer& databuf) + { + return multiply(a, b, is_derive, databuf); + } + + Tensor Tensor::reduce_sum(unsigned char dim) const + { + std::vector shape_c = this->get_buffer().shape(); + assert(dim < shape_c.size()); + std::vector> temp; + temp.push_back(std::make_pair(*this, Derivation(values(shape_c, 1).tensor_cast(this->get_buffer().type(), false), derive_reduce_sum))); + cudaError_t cuda_status; + TensorBase other_buf; + + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + const TensorBase& base_a = this->get_buffer(); + cuda_status = cudaMalloc(&c_ptr, base_a.data_size()); + device_memcpy(&c_ptr, this_cuda, base_a.data(), base_a.get_device(), base_a.data_size()); + + unsigned int dim_x = 1; + for (unsigned char i = 0; i < dim; i++) + dim_x *= shape_c[i]; + + unsigned int dim_y = 1; + for (unsigned char i = dim+1; i < shape_c.size(); i++) + dim_y *= shape_c[i]; + + constexpr unsigned int thread_value_x = 8U; + constexpr unsigned int thread_value_y = 16U; + constexpr unsigned int thread_value_z = 8U; + dim3 block_dim(thread_value_x, thread_value_y, thread_value_z); + dim3 grid_dim + ( + dim_x / block_dim.x + (dim_x % block_dim.x ? 1U : 0U), + dim_y / block_dim.y + (dim_y % block_dim.y ? 1U : 0U), + shape_c[dim] / block_dim.z + (shape_c[dim] % block_dim.z ? 
1U : 0U) + ); +#define ADD_CODE(TYPE) \ +if(base_a.type() == typeid(TYPE)) \ +{ \ +while (shape_c[dim] > 1) \ +{ \ +array_reduce_sum<<>>(static_cast(c_ptr), static_cast(c_ptr), dim_x, shape_c[dim], dim_y); \ +cuda_status = cudaDeviceSynchronize(); \ +shape_c[dim] = grid_dim.z; \ +grid_dim.z = grid_dim.z / block_dim.z + (grid_dim.z % block_dim.z ? 1U : 0U); \ +} \ +other_buf = TensorBase(typeid(TYPE), shape_c, c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf)); + } + + Tensor Tensor::reduce_max(unsigned char dim) const + { + std::vector shape_c = this->get_buffer().shape(); + assert(dim < shape_c.size()); + cudaError_t cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + const TensorBase& base_a = this->get_buffer(); + cuda_status = cudaMalloc(&c_ptr, base_a.data_size()); + device_memcpy(&c_ptr, this_cuda, base_a.data(), base_a.get_device(), base_a.data_size()); + + unsigned int dim_x = 1; + for (unsigned char i = 0; i < dim; i++) + dim_x *= shape_c[i]; + + unsigned int dim_y = 1; + for (unsigned char i = dim+1; i < shape_c.size(); i++) + dim_y *= shape_c[i]; + + constexpr unsigned int thread_value_x = 8U; + constexpr unsigned int thread_value_y = 16U; + constexpr unsigned int thread_value_z = 8U; + dim3 block_dim(thread_value_x, thread_value_y, thread_value_z); + dim3 grid_dim + ( + dim_x / block_dim.x + (dim_x % block_dim.x ? 1U : 0U), + dim_y / block_dim.y + (dim_y % block_dim.y ? 1U : 0U), + shape_c[dim] / block_dim.z + (shape_c[dim] % block_dim.z ? 
1U : 0U) + ); +#define ADD_CODE(TYPE) \ +if(base_a.type() == typeid(TYPE)) \ +{ \ +while (shape_c[dim] > 1) \ +{ \ +array_reduce_sum<<>>(static_cast(c_ptr), static_cast(c_ptr), dim_x, shape_c[dim], dim_y); \ +cuda_status = cudaDeviceSynchronize(); \ +shape_c[dim] = grid_dim.z; \ +grid_dim.z = grid_dim.z / block_dim.z + (grid_dim.z % block_dim.z ? 1U : 0U); \ +} \ +other_buf = TensorBase(typeid(TYPE), base_a.shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + std::vector> temp; + temp.push_back(std::make_pair(*this, Derivation(values(this->get_buffer().shape(), 1).tensor_cast(this->get_buffer().type(), false), derive_reduce_sum))); + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf)); + } + + Tensor Tensor::reduce_min(unsigned char dim) const + { + std::vector shape_c = this->get_buffer().shape(); + assert(dim < shape_c.size()); + cudaError_t cuda_status; + TensorBase other_buf; + void* c_ptr; + devices::Device this_cuda{ devices::CUDA }; + cuda_status = cudaGetDevice(&this_cuda.index); + cudaDeviceProp cu_dev_prop; + cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index); + const TensorBase& base_a = this->get_buffer(); + cuda_status = cudaMalloc(&c_ptr, base_a.data_size()); + device_memcpy(&c_ptr, this_cuda, base_a.data(), base_a.get_device(), base_a.data_size()); + unsigned int dim_x = 1; + for (unsigned char i = 0; i < dim; i++) + dim_x *= shape_c[i]; + + unsigned int dim_y = 1; + for (unsigned char i = dim+1; i < shape_c.size(); i++) + dim_y *= shape_c[i]; + + constexpr unsigned int thread_value_x = 8U; + constexpr unsigned int thread_value_y = 16U; + constexpr unsigned int thread_value_z = 8U; + dim3 block_dim(thread_value_x, thread_value_y, thread_value_z); + dim3 grid_dim + ( + dim_x / block_dim.x + (dim_x % block_dim.x ? 1U : 0U), + dim_y / block_dim.y + (dim_y % block_dim.y ? 1U : 0U), + shape_c[dim] / block_dim.z + (shape_c[dim] % block_dim.z ? 
1U : 0U) + ); +#define ADD_CODE(TYPE) \ +if(base_a.type() == typeid(TYPE)) \ +{ \ +while (shape_c[dim] > 1) \ +{ \ +array_reduce_sum<<>>(static_cast(c_ptr), static_cast(c_ptr), dim_x, shape_c[dim], dim_y); \ +cuda_status = cudaDeviceSynchronize(); \ +shape_c[dim] = grid_dim.z; \ +grid_dim.z = grid_dim.z / block_dim.z + (grid_dim.z % block_dim.z ? 1U : 0U); \ +} \ +other_buf = TensorBase(typeid(TYPE), base_a.shape(), c_ptr, this_cuda); \ +} + LOOP(USING_DATA_TYPE); +#undef ADD_CODE + std::vector> temp; + temp.push_back(std::make_pair(*this, Derivation(values(this->get_buffer().shape(), 1).tensor_cast(this->get_buffer().type(), false), derive_reduce_sum))); + cuda_status = cudaFree(c_ptr); + return Tensor(std::move(other_buf)); + } + } +} diff --git a/src/tensor_array/core/tensorarray.hh b/src/tensor-array/core/tensorarray.hh similarity index 100% rename from src/tensor_array/core/tensorarray.hh rename to src/tensor-array/core/tensorarray.hh diff --git a/src/tensor_array/core/tensorbase.cc b/src/tensor-array/core/tensorbase.cc similarity index 96% rename from src/tensor_array/core/tensorbase.cc rename to src/tensor-array/core/tensorbase.cc index 55c5398..d3cc7a7 100644 --- a/src/tensor_array/core/tensorbase.cc +++ b/src/tensor-array/core/tensorbase.cc @@ -23,10 +23,10 @@ limitations under the License. 
#include #include -#define USING_DATA_TYPE_FLOAT (float)(double) -#define USING_DATA_TYPE_SINT (int8_t)(int16_t)(int32_t)(int64_t) -#define USING_DATA_TYPE_UINT (uint8_t)(uint16_t)(uint32_t)(uint64_t) -#define USING_DATA_TYPE USING_DATA_TYPE_SINT USING_DATA_TYPE_UINT USING_DATA_TYPE_FLOAT +#define USING_DATA_TYPE_FLOAT() (float)(double) +#define USING_DATA_TYPE_SINT() (int8_t)(int16_t)(int32_t)(int64_t) +#define USING_DATA_TYPE_UINT() (uint8_t)(uint16_t)(uint32_t)(uint64_t) +#define USING_DATA_TYPE USING_DATA_TYPE_SINT() USING_DATA_TYPE_UINT() USING_DATA_TYPE_FLOAT() #define LOOP(seq) END(A seq) #define BODY(x) ADD_CODE(x) diff --git a/src/tensor_array/core/tensorbase.hh b/src/tensor-array/core/tensorbase.hh similarity index 94% rename from src/tensor_array/core/tensorbase.hh rename to src/tensor-array/core/tensorbase.hh index 0c0e863..e7c42d5 100644 --- a/src/tensor_array/core/tensorbase.hh +++ b/src/tensor-array/core/tensorbase.hh @@ -23,14 +23,14 @@ limitations under the License. #include "initializer_wrapper.hh" #pragma once -#ifdef __WIN32__ -#ifdef CUDA_ML_EXPORTS -#define CUDA_ML_API __declspec(dllexport) +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_CORE_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) #else -#define CUDA_ML_API __declspec(dllimport) +#define TENSOR_ARRAY_API __declspec(dllimport) #endif #else -#define CUDA_ML_API +#define TENSOR_ARRAY_API #endif namespace tensor_array @@ -40,7 +40,7 @@ namespace tensor_array /** * \brief This class look like std::any but it tensor. */ - class CUDA_ML_API TensorBase + class TENSOR_ARRAY_API TensorBase { private: struct TensorStorage @@ -64,7 +64,7 @@ namespace tensor_array class TensorArrayStorage final : public TensorStorage { private: - static constexpr const std::array dim_size_array{ sz0, sz... }; + static constexpr inline const unsigned int dim_size_array[sizeof...(sz) + 1ULL] = { sz0, sz... 
}; const TensorArray arr_data; public: constexpr TensorArrayStorage(const TensorArray& arr_data) : @@ -87,7 +87,7 @@ namespace tensor_array inline std::initializer_list dim_sizes() const override { - return wrapper::initializer_wrapper(dim_size_array.data(), dim_size_array.data() + sizeof...(sz) + 1ULL); + return wrapper::initializer_wrapper(dim_size_array, dim_size_array + sizeof...(sz) + 1ULL); } inline const void* data() const override @@ -210,7 +210,7 @@ namespace tensor_array }; std::size_t get_sizeof_type(const std::type_info&); -} + } } -#undef CUDA_ML_API \ No newline at end of file +#undef TENSOR_ARRAY_API diff --git a/src/tensor-array/interp/main.c b/src/tensor-array/interp/main.c new file mode 100644 index 0000000..7aa7285 --- /dev/null +++ b/src/tensor-array/interp/main.c @@ -0,0 +1,117 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include +#include "option.h" +#include "open_file.h" +#include "parser.h" + +void initialize(int argc, char *argv[]) +{ + int i, fd; + while (argc <= 0) + { + char *argv_opt = ""; + size_t poolsize = 1024; // Default pool size + switch (argv_opt[0]) + { + case '-': + switch (argv_opt[1]) + { + case 'h': + help(); + return; + case 'v': + version(); + return; + case 'f': + if (argc < 2) + { + fprintf(stderr, "Error: No file specified after -f option\n"); + exit(1); + return; + } + read_file(argv[1]); + argc--; + argv++; + return; + case '-': + if (strcmp(argv_opt, "--help") == 0) + { + help(); + return; + } + else if (strcmp(argv_opt, "--version") == 0) + { + version(); + return; + } + else if (strcmp(argv_opt, "--poolsize") == 0) + { + if (argc < 2) + { + fprintf(stderr, "Error: No pool size specified after --poolsize option\n"); + exit(1); + return; + } + poolsize = atoi(argv[1]); + if (poolsize <= 0) + { + fprintf(stderr, "Error: Invalid pool size specified\n"); + exit(1); + return; + } + argc--; + argv++; + } + else if (strcmp(argv_opt, "--file") == 0) + { + if (argc < 2) + { + fprintf(stderr, "Error: No file specified after --file option\n"); + exit(1); + return; + } + read_file(argv[1]); + argc--; + argv++; + } + return; + default: + read_file(argv[0]); + return; + } + break; + default: + break; + } + argc--; + argv++; + } + +} + +int main(int argc, char *argv[]) +{ + printf("Hello\n"); + initialize(argc-1, argv+1); + program(); + return 0; +} +// Future implementations may include command-line argument parsing, initialization of the TensorArray library, +// and other necessary setup for the interp functionality. 
diff --git a/src/tensor-array/interp/open_file.c b/src/tensor-array/interp/open_file.c new file mode 100644 index 0000000..63a34f6 --- /dev/null +++ b/src/tensor-array/interp/open_file.c @@ -0,0 +1,79 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include +#include "open_file.h" + +char *src = NULL; +VM_INSTRUCTION *text = NULL; +size_t poolsize = 1024; // Default pool size + +void interp_malloc() +{ + src = malloc(poolsize); + if (src == NULL) + { + fprintf(stderr, "Error: Could not allocate memory for interpreter\n"); + exit(1); + } + text = malloc(poolsize*sizeof(VM_INSTRUCTION)); + if (text == NULL) + { + fprintf(stderr, "Error: Could not allocate memory for interpreter text\n"); + free(src); + exit(1); + } +} + +void interp_memreset() +{ + memset(text, 0, poolsize); + memset(src, 0, poolsize); +} + +void interp_free() +{ + free(text); + free(src); +} + +void read_file(const char* filename) +{ + FILE* fptr = fopen(filename, "r"); + if (fptr == NULL) + { + fprintf(stderr, "Error: Could not open file %s\n", filename); + exit(1); + } + + int i; + interp_malloc(); + interp_memreset(); + i = fread(src, poolsize, 1, fptr); + if (i < 0) + { + fprintf(stderr, "Error: Could not read file %s\n", filename); + fclose(fptr); + exit(1); + } + orig = text; + text = text - 1; + src[i] = '\0'; + fclose(fptr); + +} diff --git a/src/tensor-array/interp/open_file.h b/src/tensor-array/interp/open_file.h new file mode 100644 index 
0000000..7d37999 --- /dev/null +++ b/src/tensor-array/interp/open_file.h @@ -0,0 +1,27 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "vm_instruction.h" +#include "vm.h" + +extern char *src; +extern VM_INSTRUCTION *text; +extern size_t poolsize; + +void interp_malloc(); +void interp_memreset(); +void interp_free(); +void read_file(const char*); diff --git a/src/tensor-array/interp/option.c b/src/tensor-array/interp/option.c new file mode 100644 index 0000000..ef83c45 --- /dev/null +++ b/src/tensor-array/interp/option.c @@ -0,0 +1,32 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include + +void help() +{ + printf("Usage: tensor-array [options]\n"); + printf("Options:\n"); + printf(" -h, --help Show this help message\n"); + printf(" -v, --version Show version information\n"); + printf(" --poolsize SIZE Set the pool size (default: 1024)\n"); + printf(" -f, --file FILE Open the specified file\n"); +} + +void version() +{ + printf("Tensor Array Interpreter Version 0.1.0\n"); +} \ No newline at end of file diff --git a/src/tensor-array/interp/option.h b/src/tensor-array/interp/option.h new file mode 100644 index 0000000..168b613 --- /dev/null +++ b/src/tensor-array/interp/option.h @@ -0,0 +1,18 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +void help(); +void version(); diff --git a/src/tensor-array/interp/parser.c b/src/tensor-array/interp/parser.c new file mode 100644 index 0000000..0bd9d0c --- /dev/null +++ b/src/tensor-array/interp/parser.c @@ -0,0 +1,320 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include +#include +#include "parser.h" +#include "token.h" +#include "open_file.h" +#include "vm_instruction.h" +#include "sym_map.h" +#include "vm_type.h" + +void emit(unsigned size, ...) +{ + va_list args; + va_start(args, size); + + // Process the variable arguments as needed + for (unsigned i = 0; i < size; ++i) { + ++text; + *text = va_arg(args, VM_INSTRUCTION); + } + + va_end(args); +} + +void match(int tk) +{ + if (tkn == tk) { + token_next(); // Move to the next token + } else { + if (tk < 0x80) { + fprintf(stderr, "Error: Expected token %ld but found %ld\n", tk, tkn); + } else { + char* tn = tknname[tk - 0x80]; + fprintf(stderr, "Error: Expected token %s but found %s\n", tn, tkn); + } + exit(1); + } +} + +void expression(int level) +{ + sym_data* temp = NULL; // Temporary variable to hold intermediate values + int isArrRef = 0; // Flag to check if we are dealing with an array reference + // This function would handle parsing and evaluating expressions + // For now, it is a placeholder + // You can implement your logic here + switch (tkn) + { + case TOKEN_NUM: + /* code */ + emit(3, IMM, TYPE_INT, tkn_val); + match(TOKEN_NUM); + break; + case TOKEN_ID: + /* code */ + temp = sym_cur; + match(TOKEN_ID); + if (temp->type) + { + if (tkn == '(') + { + /* code */ + match('('); + match(')'); + emit(2, CALL, temp->data); + } + + } + else + { + emit(3, IMM, TYPE_PTR, tkn_val); + emit(1, GET); + } + break; + case '"': + { + emit(3, IMM, TYPE_STRING, tkn_val); + match('"'); // Match the opening quote + } + break; + case '[': + if (temp == NULL) + { + *text = PTR_PUSH; // Push the current value onto the stack + match('['); // Match the opening bracket + expression(TOKEN_ASSIGN); // Parse the expression inside the brackets + emit(1, GETELEM); // Emit get element instruction + match(']'); // Match the closing bracket + } + break; + default: + 
break; + } + + while (tkn >= level) + { + switch (tkn) + { + case TOKEN_ASSIGN: + if (*text != GET && *text != GETELEM) + { + fprintf(stderr, "Error: Assignment without a variable\n"); + exit(1); + } + *text = PTR_PUSH; // Push the current value onto the stack + match(TOKEN_ASSIGN); + expression(TOKEN_ASSIGN); // Parse the right-hand side expression + if (isArrRef) emit(1, SETELEM); // Emit set element instruction if it's an array reference + else emit(1, SET); // Emit set instruction + break; + case TOKEN_EQ: + emit(1, PUSH); + match(TOKEN_EQ); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, EQ); // Emit equality instruction + break; + case TOKEN_NE: + emit(1, PUSH); + match(TOKEN_NE); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, NE); // Emit not equal instruction + break; + case TOKEN_LT: + emit(1, PUSH); + match(TOKEN_LT); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, LT); // Emit less than instruction + break; + case TOKEN_GT: + emit(1, PUSH); + match(TOKEN_GT); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, GT); // Emit greater than instruction + break; + case TOKEN_LE: + emit(1, PUSH); + match(TOKEN_LE); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, LE); // Emit less than or equal instruction + break; + case TOKEN_GE: + emit(1, PUSH); + match(TOKEN_GE); + expression(TOKEN_SHL); // Parse the right-hand side expression + emit(1, GE); // Emit greater than or equal instruction + break; + case TOKEN_SHL: + emit(1, PUSH); + match(TOKEN_SHL); + expression(TOKEN_ADD); // Parse the right-hand side expression + emit(1, SHL); // Emit shift left instruction + break; + case TOKEN_SHR: + emit(1, PUSH); + match(TOKEN_SHR); + expression(TOKEN_ADD); // Parse the right-hand side expression + emit(1, SHR); // Emit shift right instruction + break; + case TOKEN_ADD: + emit(1, PUSH); + match(TOKEN_ADD); + expression(TOKEN_MUL); // Parse the 
right-hand side expression + emit(1, ADD); // Emit add instruction + break; + case TOKEN_SUB: + emit(1, PUSH); + match(TOKEN_SUB); + expression(TOKEN_MUL); // Parse the right-hand side expression + emit(1, SUB); // Emit subtract instruction + break; + case TOKEN_MUL: + emit(1, PUSH); + match(TOKEN_MUL); + expression(TOKEN_MATMUL); // Parse the right-hand side expression + emit(1, MUL); // Emit multiply instruction + break; + case TOKEN_DIV: + emit(1, PUSH); + match(TOKEN_DIV); + expression(TOKEN_MATMUL); // Parse the right-hand side expression + emit(1, DIV); // Emit divide instruction + break; + case TOKEN_MATMUL: + emit(1, PUSH); + match(TOKEN_MATMUL); + expression(TOKEN_INC); // Parse the right-hand side expression + emit(1, MATMUL); // Emit matrix multiply instruction + break; + default: + fprintf(stderr, "Error: Unrecognized token in expression\n"); + exit(1); + } + } + +} + +void statement() +{ + // This function would handle parsing and executing statements + // For now, it is a placeholder + // You can implement your logic here + switch (tkn) + { + case TOKEN_IF: + { + match(TOKEN_IF); + match('('); + expression(TOKEN_ASSIGN); // Parse the condition expression + match(')'); + emit(1, JZ); // Emit jump if zero instruction + VM_INSTRUCTION *b = ++text; // Placeholder for jump address + statement(); // Parse the statement inside the if block + if (tkn == TOKEN_ELSE) + { + match(TOKEN_ELSE); + emit(1, JMP); // Emit jump instruction + *b = text + 2; // Set the jump address to the next instruction + b = ++text; + statement(); // Parse the else block + } + *b = text + 1; // Set the jump address to the next instruction + } + break; + case TOKEN_WHILE: + { + VM_INSTRUCTION *a = NULL; // Placeholder for jump address + VM_INSTRUCTION *b = text+1; // Placeholder for jump address + match(TOKEN_WHILE); + match('('); + expression(TOKEN_ASSIGN); // Parse the condition expression + match(')'); + emit(1, JZ); // Emit jump if zero instruction + a=++text; // Set the jump 
address to the start of the while block + statement(); // Parse the statement inside the while block + emit(1, JMP); // Emit jump instruction to loop back + emit(1, b); // Emit the address to jump back to + *a = text + 1; // Set the jump address to the next instruction + } + break; + case TOKEN_FUNC: + match(TOKEN_FUNC); + if (tkn != TOKEN_ID) + { + fprintf(stderr, "Error: function name\n"); + exit(1); + } + sym_cur->type = TYPE_FUNC; + sym_cur->data = malloc(1024*8); + VM_INSTRUCTION *save = text; + text = sym_cur->data; + match(TOKEN_ID); + match('('); + match(')'); + statement(); + if (*text != RET) emit(1, RET); + text = save; + break; + case TOKEN_RETURN: + match(TOKEN_RETURN); + expression(TOKEN_ASSIGN); + emit(1, RET); + break; + case '{': + match('{'); + while (tkn != '}') + statement(); + match('}'); + break; + case '\0': + return; + default: + expression(TOKEN_ASSIGN); + if (tkn == ';') + match(';'); + break; + } +} + +void program() +{ + while (1) + { + // This is a placeholder for the main program loop + // You would typically call emit or other functions here based on your program logic + // Add your program logic here + interp_malloc(); + orig = text+1; + char *isrc = src; + VM_INSTRUCTION *itext = text; + interp_memreset(); + printf(">>> "); + fflush(stdout); + fgets(src, poolsize-1, stdin); // Read input from stdin + token_next(); + statement(); + emit(1, EXIT); // Emit a token with value 0 to indicate end of processing + eval(); + printf("eval \n"); + puts(""); + free(itext); + free(isrc); + } +} diff --git a/src/tensor-array/interp/parser.h b/src/tensor-array/interp/parser.h new file mode 100644 index 0000000..021ba4a --- /dev/null +++ b/src/tensor-array/interp/parser.h @@ -0,0 +1,17 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +void program(); diff --git a/src/tensor-array/interp/sym_map.cc b/src/tensor-array/interp/sym_map.cc new file mode 100644 index 0000000..9794848 --- /dev/null +++ b/src/tensor-array/interp/sym_map.cc @@ -0,0 +1,38 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include "sym_map.h" + +sym_data* sym_cur = NULL; + +scope sym_map; + +void sym_data_set(char* name, sym_data dat) +{ + sym_map[name] = dat; +} + +sym_data* sym_data_get(char* name) +{ + return &sym_map[name]; +} + +int glob_data_find(char* name) +{ + return sym_map.find(name) != sym_map.end(); +} diff --git a/src/tensor-array/interp/sym_map.h b/src/tensor-array/interp/sym_map.h new file mode 100644 index 0000000..b60eeb0 --- /dev/null +++ b/src/tensor-array/interp/sym_map.h @@ -0,0 +1,40 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifdef __cplusplus +extern "C" +{ +#endif + typedef struct + { + int tkn; + int hash; + int cls; + int type; + void* data; // Pointer to additional data if needed + } sym_data; + void sym_data_set(char* name, sym_data dat); + sym_data* sym_data_get(char*); + int glob_data_find(char* name); + extern sym_data* sym_cur; +#ifdef __cplusplus +} + +#include +#include +typedef std::map scope; +extern scope sym_map; +#endif diff --git a/src/tensor-array/interp/token.c b/src/tensor-array/interp/token.c new file mode 100644 index 0000000..888150d --- /dev/null +++ b/src/tensor-array/interp/token.c @@ -0,0 +1,313 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include +#include +#include "sym_map.h" +#include "open_file.h" +#include "token.h" + +int tkn = 0; +int tkn_val = 0; // Variable to hold the value of the current token + +char* tknname[] = { + "num", "sys", "glo", "loc", "id", + "func", "else", "enum", "if", "return", "sizeof", + "while", "assign", "cond", "lor", "lan", + "or", "xor", "and", + "eq", "ne", "lt", "gt", "le", "ge", + "add", "sub", "mul", "div", "matmul", "pos", "neg", "not", "brak" +}; +void token_next() +{ + while ((tkn = *src++) != '\0') + { + switch (tkn) + { + case ' ': + case '\t': + /* code */ + break; + case '#': + /* code */ + while (*src != '\n' && *src != '\0') src++; + break; + case '"': + case '\'': + { + char* last_pos = src; + while (*src != tkn && *src != '\0') src++; + char *st1 = malloc(src-last_pos+2); + memset(st1, 0, src-last_pos+2); + src = last_pos; + for (unsigned int i = 0; *src != tkn && *src != '\0'; i++) + { + tkn_val=*src++; + if (tkn_val == '\\') + { + switch (*++src) + { + case '\\': + tkn_val='\\'; + break; + case 'r': + tkn_val='\r'; + break; + case 'n': + tkn_val='\n'; + break; + default: + if (*src == tkn) tkn_val=tkn; + else + { + fprintf(stderr, "invalid character string."); + exit(1); + } + break; + } + } + st1[i] = tkn_val; + } + tkn_val = st1; // Store the start of the string literal + } + return; // Exit after processing the string literal + case '/': + switch (src[0]) + { + case '/': + /* code */ + while (*src != '\n' && *src != '\0') src++; + break; + case '*': + /* code */ + src++; + while (*src != '\0' && !(src[0] == '*' && src[1] == '/')) src++; + if (*src == '\0') { + fprintf(stderr, "Error: Unmatched comment block\n"); + exit(1); + } + src += 2; // Skip past the closing */ + break; + case '=': + src++; + tkn = TOKEN_DIV; // Store the token value + return; // Exit after processing the division operator + default: + tkn = TOKEN_DIV; // Store the token value + return; // Exit after processing the division operator + } + + case 
'*': + if (*src == '=') + { + src++; + tkn = TOKEN_MUL; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_MUL; // Store the token value + return; // Exit after processing the token + } + case '+': + if (*src == '=') + { + src++; + tkn = TOKEN_ADD; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_ADD; // Store the token value + return; // Exit after processing the token + } + case '-': + if (*src == '=') + { + src++; + tkn = TOKEN_SUB; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_SUB; // Store the token value + return; // Exit after processing the token + } + case '=': + if (*src == '=') + { + src++; + tkn = TOKEN_EQ; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_ASSIGN; // Store the token value + return; // Exit after processing the token + } + case '!': + if (*src == '=') + { + src++; + tkn = TOKEN_NE; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_NOT; // Store the token value + return; // Exit after processing the token + } + case '<': + if (*src == '=') + { + src++; + tkn = TOKEN_LE; // Store the token value + return; // Exit after processing the token + } + else if (*src == '<') + { + src++; + tkn = TOKEN_SHL; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_LT; // Store the token value + return; // Exit after processing the token + } + case '>': + if (*src == '=') + { + src++; + tkn = TOKEN_GE; // Store the token value + return; // Exit after processing the token + } + else if (*src == '>') + { + src++; + tkn = TOKEN_SHR; // Store the token value + return; // Exit after processing the token + } + else + { + tkn = TOKEN_GT; // Store the token value + return; // Exit after processing the token + } + case '&': + if (*src == '&') + { + src++; + tkn = TOKEN_AND; 
// Store the token value + return; // Exit after processing the token + } + else + { + fprintf(stderr, "Error: Unrecognized token '&'\n"); + exit(1); + } + case '|': + if (*src == '|') + { + src++; + tkn = TOKEN_LOR; // Store the token value + return; // Exit after processing the token + } + else + { + fprintf(stderr, "Error: Unrecognized token '|'\n"); + exit(1); + } + case '@': + src++; + tkn = TOKEN_MATMUL; // Store the token value + return; // Exit after processing the token + case '[': + case ']': + case '(': + case ')': + case '{': + case '}': + case ',': + case ';': + case ':': + return; + default: + if (tkn >= '0' && tkn <= '9') + { + tkn_val = tkn - '0'; + if (tkn == '0' && (*src == 'x' || *src == 'X')) + { + src++; + while ((*src >= '0' && *src <= '9') || (*src >= 'a' && *src <= 'f') || (*src >= 'A' && *src <= 'F')) + { + tkn_val = (tkn_val << 4) + (*src >= '0' && *src <= '9' ? *src - '0' : (*src >= 'a' && *src <= 'f' ? *src - 'a' + 10 : *src - 'A' + 10)); + src++; + } + /* code to handle hexadecimal number */ + } + else + { + while (*src >= '0' && *src <= '9') + { + tkn_val = (tkn_val * 10) + (*src - '0'); + src++; + } + /* code to handle decimal number */ + } + tkn = TOKEN_NUM; // Set the token type + return; // Exit after processing the number + } + else if ((tkn >= 'a' && tkn <= 'z') || (tkn >= 'A' && tkn <= 'Z') || tkn == '_') + { + char* last_pos = src - 1; + long hash = tkn; + while ((*src >= '0' && *src <= '9') || (*src >= 'a' && *src <= 'z') || (*src >= 'A' && *src <= 'Z') || *src == '_') + { + hash = (hash * 0x40) + *src; + src++; + } + int char_len = src-last_pos; + char *name = malloc(char_len+1); + memcpy(name, last_pos, char_len); + name[char_len] = '\0'; + tkn_val = name; + if (glob_data_find(name)) + { + /* code to handle existing identifier */ + tkn = sym_data_get(name)->tkn; // Set the token type from the existing identifier + return; // Exit after processing the existing identifier + } + /* code to handle identifiers */ + sym_data item; 
+ item.hash = hash; + item.data = NULL; // Initialize data pointer if needed + + tkn = item.tkn = TOKEN_ID; // Set the token type + sym_data_set(name,item); + sym_cur = sym_data_get(name); + return; // Exit after processing the identifier + } + else + { + printf("invalid symbol %c", tkn); + } + break; + } + } + +} \ No newline at end of file diff --git a/src/tensor-array/interp/token.h b/src/tensor-array/interp/token.h new file mode 100644 index 0000000..835eaf5 --- /dev/null +++ b/src/tensor-array/interp/token.h @@ -0,0 +1,30 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +typedef enum +{ + TOKEN_NUM = 0x80, TOKEN_SYS, TOKEN_GLO, TOKEN_LOC, TOKEN_ID, + TOKEN_FUNC, TOKEN_ELSE, TOKEN_ENUM, TOKEN_IF, TOKEN_RETURN, TOKEN_SIZEOF, + TOKEN_WHILE, TOKEN_ASSIGN, TOKEN_COND, TOKEN_LOR, TOKEN_LAN, + TOKEN_OR, TOKEN_XOR, TOKEN_AND, TOKEN_SHL, TOKEN_SHR, + TOKEN_EQ, TOKEN_NE, TOKEN_LT, TOKEN_GT, TOKEN_LE, TOKEN_GE, TOKEN_POS, TOKEN_NEG, + TOKEN_ADD, TOKEN_SUB, TOKEN_MUL, TOKEN_DIV, TOKEN_MATMUL, TOKEN_INC, TOKEN_NOT +} TOKEN_TYPE; + +void token_next(); +extern int tkn; +extern int tkn_val; +extern char *tknname[]; diff --git a/src/tensor-array/interp/vm.c b/src/tensor-array/interp/vm.c new file mode 100644 index 0000000..edd4a72 --- /dev/null +++ b/src/tensor-array/interp/vm.c @@ -0,0 +1,214 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include "vm_instruction.h" +#include "vmop.h" +#include "vm.h" + +VM_INSTRUCTION* orig = NULL; +VM_INSTRUCTION* pc = NULL; + +void eval() +{ + printf("vmstart\n"); + VM_INSTRUCTION_V2 op; + pc = orig; + while (1) + { + op = *((VM_INSTRUCTION_V2*)pc++); + printf("vmopassign %ld %ld %ld \n", orig, pc, op); + switch (op) + { + case LEA: + // Load effective address + break; + case IMM: + // Immediate value + any_type = *pc++; + any_value = *pc++; + op_imm(); + break; + case JMP: + // Jump to address + pc = (VM_INSTRUCTION*) *pc; + break; + case CALL: + // Function call + break; + case JZ: + // Jump if zero + pc = (any_value) ? 
(VM_INSTRUCTION*)*pc : pc + 1; + break; + case JNZ: + // Jump if not zero + pc = (any_value) ? pc + 1 : (VM_INSTRUCTION*)*pc; + break; + case ENT: + // Enter function + break; + case ADJ: + // Adjust stack pointer + break; + case LEV: + // Leave function + break; + case RET: + // Return from function + return; + case LI: + // Load integer + break; + case LC: + // Load character + break; + case SI: + // Store integer + break; + case SC: + // Store character + break; + case SET: + // Set value + op_set(); + break; + case GET: + // Get value + op_get(); + break; + case PUSH: + // Push value onto stack + op_push(); + break; + case PTR_PUSH: + // Push value onto stack + op_ptr_push(); + break; + case GETELEM: + // Get element from array + break; + case SETELEM: + // Set element in array + break; + case ADDELEM: + // Add element to array + break; + case OR: + // Logical OR operation + op_or(); + break; + case XOR: + // Logical XOR operation + break; + case AND: + // Logical AND operation + op_and(); + break; + case EQ: + // Equality check + op_eq(); + break; + case NE: + // Not equal check + op_ne(); + break; + case LT: + // Less than check + op_lt(); + break; + case GT: + // Greater than check + op_gt(); + break; + case LE: + // Less than or equal check + op_le(); + break; + case GE: + // Greater than or equal check + op_ge(); + break; + case ADD: + // Addition operation + op_add(); + break; + case SUB: + // Subtraction operation + op_sub(); + break; + case MUL: + // Multiplication operation + op_mul(); + break; + case DIV: + // Division operation + op_div(); + break; + case MATMUL: + // Matrix multiplication operation + op_matmul(); + break; + case POS: + // Unary plus operation + op_pos(); + break; + case NEG: + // Unary minus operation + op_neg(); + break; + case NOT: + // Logical NOT operation + op_not(); + break; + case SHL: + // Shift left operation + break; + case SHR: + // Shift right operation + break; + case OPEN: + // Open file operation + op_open(); + break; 
+ case READ: + // Read from file operation + break; + case CLOSE: + // Close file operation + break; + case PRTF: + // Print formatted output + break; + case MALC: + // Memory allocation operation + break; + case MSET: + // Memory set operation + break; + case MCMP: + // Memory compare operation + break; + case EXIT: + // Exit operation + op_exit(); + return; + default: + fprintf(stderr, "Unknown instruction: %d\n", op); + exit(1); + } + } +} diff --git a/src/tensor-array/interp/vm.h b/src/tensor-array/interp/vm.h new file mode 100644 index 0000000..6c1633e --- /dev/null +++ b/src/tensor-array/interp/vm.h @@ -0,0 +1,26 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +typedef enum +{ + LEA, IMM, JMP, CALL, JZ, JNZ, ENT, ADJ, LEV, RET, LI, LC, SI, SC, SET, GET, PUSH, PTR_PUSH, GETELEM, SETELEM, ADDELEM, + OR, XOR, AND, EQ, NE, LT, GT, LE, GE, ADD, SUB, MUL, DIV, MATMUL, POS, NEG, NOT, SHL, SHR, + OPEN, READ, CLOSE, PRTF, MALC, MSET, MCMP, EXIT +} VM_INSTRUCTION_V2; + +void eval(); + +extern size_t any_value; diff --git a/src/tensor_array/core/extern_type_map.hh b/src/tensor-array/interp/vm_instruction.h similarity index 75% rename from src/tensor_array/core/extern_type_map.hh rename to src/tensor-array/interp/vm_instruction.h index 9146a3e..29677f5 100644 --- a/src/tensor_array/core/extern_type_map.hh +++ b/src/tensor-array/interp/vm_instruction.h @@ -14,13 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include - -namespace tensor_array +#ifdef __cplusplus +extern "C" { - namespace value - { - extern std::unordered_map dynamic_type_size; - } +#endif + typedef size_t VM_INSTRUCTION; + extern VM_INSTRUCTION* orig; + extern VM_INSTRUCTION* pc; +#ifdef __cplusplus } +#endif \ No newline at end of file diff --git a/src/tensor-array/interp/vm_type.h b/src/tensor-array/interp/vm_type.h new file mode 100644 index 0000000..740016a --- /dev/null +++ b/src/tensor-array/interp/vm_type.h @@ -0,0 +1,31 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifdef __cplusplus +extern "C" +{ +#endif + typedef enum + { + TYPE_NONE, + TYPE_STRING, + TYPE_INT, + TYPE_PTR, + TYPE_FUNC + } VM_TYPE; +#ifdef __cplusplus +} +#endif diff --git a/src/tensor-array/interp/vmop.cc b/src/tensor-array/interp/vmop.cc new file mode 100644 index 0000000..022e672 --- /dev/null +++ b/src/tensor-array/interp/vmop.cc @@ -0,0 +1,317 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include +#include +#include "sym_map.h" +#include "vm_instruction.h" +#include "vmop.h" +#include "vm_type.h" + +std::stack tensor_stack; +std::stack var_stack; +std::stack> call_stack; +tensor_array::value::Tensor ag; +void* aptr; +size_t any_value; +size_t any_type; + +void new_int() +{ + tensor_array::value::TensorArray tmp2 = {any_value}; + tensor_array::value::Tensor tmp1(tmp2); + ag = tmp1; +} + +void new_ptr() +{ + aptr = reinterpret_cast(any_value); +} + +void new_string() +{ + char* str = reinterpret_cast(any_value); + unsigned int s_len = std::strlen(str); + tensor_array::value::TensorBase tmp1(typeid(char),{s_len}, str); + ag = tmp1; + std::free(str); +} + +void op_imm() +{ + switch (any_type) + { + case TYPE_INT: + /* code */ + new_int(); + break; + case TYPE_STRING: + new_string(); + break; + case TYPE_PTR: + new_ptr(); + break; + default: + break; + } +} + +void op_add() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag += tensor_stack.top(); + 
tensor_stack.pop(); +} + +void op_sub() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag -= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_mul() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag *= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_div() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag /= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_matmul() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = tensor_array::value::matmul(ag, tensor_stack.top()); + tensor_stack.pop(); +} + +void op_pos() +{ + ag = +ag; +} + +void op_neg() +{ + ag = -ag; +} + +void op_and() +{ + ag = ag && tensor_stack.top(); + tensor_stack.pop(); +} + +void op_or() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag || tensor_stack.top(); + tensor_stack.pop(); +} + +void op_not() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = !ag; +} + +void op_eq() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag == tensor_stack.top(); + tensor_stack.pop(); +} + +void op_ne() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag != tensor_stack.top(); + tensor_stack.pop(); +} + +void op_lt() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag < tensor_stack.top(); + tensor_stack.pop(); +} + +void op_gt() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag > tensor_stack.top(); + tensor_stack.pop(); +} + +void op_le() +{ + if (tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag <= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_ge() +{ + if 
(tensor_stack.empty()) + { + throw std::runtime_error("Tensor stack is empty"); + } + ag = ag >= tensor_stack.top(); + tensor_stack.pop(); +} + +void op_shl() +{ + // ag = ag << bg; +} + +void op_shr() +{ + // ag = ag >> bg; +} + +void op_call() +{ + VM_INSTRUCTION* pc1 = (VM_INSTRUCTION*)*pc++; + call_stack.push({std::move(pc), std::move(sym_map)}); + pc = pc1; +} + +void op_ret() +{ + pc = call_stack.top().first; + sym_map = std::move(call_stack.top().second); + call_stack.pop(); +} + +void op_open() +{ + // Implementation for opening a file or resource +} + +void op_read() +{ + // Implementation for reading from a file or resource +} + +void op_close() +{ + // Implementation for closing a file or resource +} + +void op_prtf() +{ + // Implementation for printing formatted output +} + +void op_malc() +{ + // Implementation for memory allocation +} + +void op_mset() +{ + // Implementation for setting memory +} + +void op_mcmp() +{ + // Implementation for memory comparison +} + +void op_exit() +{ + // Implementation for exiting the program + // std::cout << ag << std::endl; +} + +void op_push() +{ + tensor_stack.push(ag); +} + +void op_ptr_push() +{ + var_stack.push(reinterpret_cast(aptr)); + std::free(aptr); +} + +void op_get() +{ + char *var_name = reinterpret_cast(aptr); + sym_data& temp = sym_map[var_name]; + std::free(aptr); + ag = *reinterpret_cast(temp.data); +} + +void op_set() +{ + if (!var_stack.empty()) + { + std::string& var_name = var_stack.top(); + sym_data& temp = sym_map[var_name]; + delete temp.data; // Set the top of the stack to ag + temp.data = new tensor_array::value::Tensor(ag); + var_stack.pop(); + } + else + { + throw std::runtime_error("Tensor stack is empty"); + } +} diff --git a/src/tensor-array/interp/vmop.h b/src/tensor-array/interp/vmop.h new file mode 100644 index 0000000..a8a98e0 --- /dev/null +++ b/src/tensor-array/interp/vmop.h @@ -0,0 +1,57 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 
2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifdef __cplusplus +extern "C" +{ +#endif + extern size_t any_value; + extern size_t any_type; + void op_imm(); + void op_add(); + void op_sub(); + void op_mul(); + void op_div(); + void op_matmul(); + void op_pos(); + void op_neg(); + void op_and(); + void op_or(); + void op_not(); + void op_eq(); + void op_ne(); + void op_lt(); + void op_gt(); + void op_le(); + void op_ge(); + void op_shl(); + void op_shr(); + void op_open(); + void op_read(); + void op_close(); + void op_prtf(); + void op_malc(); + void op_mset(); + void op_mcmp(); + void op_exit(); + void op_push(); + void op_ptr_push(); + void op_get(); + void op_set(); +#ifdef __cplusplus +} +#endif + diff --git a/src/tensor_array/layers/attention.cc b/src/tensor-array/layers/attention.cc similarity index 100% rename from src/tensor_array/layers/attention.cc rename to src/tensor-array/layers/attention.cc diff --git a/src/tensor_array/layers/attention.hh b/src/tensor-array/layers/attention.hh similarity index 73% rename from src/tensor_array/layers/attention.hh rename to src/tensor-array/layers/attention.hh index 8cd7480..dbb4f3c 100644 --- a/src/tensor_array/layers/attention.hh +++ b/src/tensor-array/layers/attention.hh @@ -19,13 +19,23 @@ limitations under the License. 
#include "normalization.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - value::Tensor CUDA_ML_API scaled_dot_product_attention(const value::Tensor&, const value::Tensor&, const value::Tensor&, const value::Tensor& = value::Tensor()); + value::Tensor TENSOR_ARRAY_API scaled_dot_product_attention(const value::Tensor&, const value::Tensor&, const value::Tensor&, const value::Tensor& = value::Tensor()); - class CUDA_ML_API MultiHeadAttentionImpl final : + class TENSOR_ARRAY_API MultiHeadAttentionImpl final : public LayerImpl { private: @@ -39,4 +49,6 @@ namespace tensor_array using MultiHeadAttention = LayerHolder; } -} \ No newline at end of file +} + +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/convolution.cc b/src/tensor-array/layers/convolution.cc similarity index 100% rename from src/tensor_array/layers/convolution.cc rename to src/tensor-array/layers/convolution.cc diff --git a/src/tensor_array/layers/convolution.hh b/src/tensor-array/layers/convolution.hh similarity index 86% rename from src/tensor_array/layers/convolution.hh rename to src/tensor-array/layers/convolution.hh index 644f1a7..e217a86 100644 --- a/src/tensor_array/layers/convolution.hh +++ b/src/tensor-array/layers/convolution.hh @@ -18,11 +18,21 @@ limitations under the License. 
#include "layer_utility.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API ConvolutionLayerImpl : + class TENSOR_ARRAY_API ConvolutionLayerImpl : public TensorCalculateLayerImpl { protected: @@ -39,7 +49,7 @@ namespace tensor_array value::Tensor calculate(const value::Tensor&) override final; }; - class CUDA_ML_API Conv1D_Impl final : + class TENSOR_ARRAY_API Conv1D_Impl final : public ConvolutionLayerImpl { public: @@ -49,7 +59,7 @@ namespace tensor_array using Conv1D = LayerHolder; - class CUDA_ML_API Conv2D_Impl final : + class TENSOR_ARRAY_API Conv2D_Impl final : public ConvolutionLayerImpl { public: @@ -59,7 +69,7 @@ namespace tensor_array using Conv2D = LayerHolder; - class CUDA_ML_API Conv3D_Impl final : + class TENSOR_ARRAY_API Conv3D_Impl final : public ConvolutionLayerImpl { public: @@ -71,3 +81,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/layer_any.cc b/src/tensor-array/layers/layer_any.cc similarity index 100% rename from src/tensor_array/layers/layer_any.cc rename to src/tensor-array/layers/layer_any.cc diff --git a/src/tensor_array/layers/layer_any.hh b/src/tensor-array/layers/layer_any.hh similarity index 100% rename from src/tensor_array/layers/layer_any.hh rename to src/tensor-array/layers/layer_any.hh diff --git a/src/tensor_array/layers/layer_holder.hh b/src/tensor-array/layers/layer_holder.hh similarity index 100% rename from src/tensor_array/layers/layer_holder.hh rename to src/tensor-array/layers/layer_holder.hh diff --git a/src/tensor_array/layers/layer_impl.cc b/src/tensor-array/layers/layer_impl.cc similarity index 100% rename from src/tensor_array/layers/layer_impl.cc rename to src/tensor-array/layers/layer_impl.cc diff --git 
a/src/tensor_array/layers/layer_impl.hh b/src/tensor-array/layers/layer_impl.hh similarity index 87% rename from src/tensor_array/layers/layer_impl.hh rename to src/tensor-array/layers/layer_impl.hh index 7197c38..7852524 100644 --- a/src/tensor_array/layers/layer_impl.hh +++ b/src/tensor-array/layers/layer_impl.hh @@ -16,25 +16,26 @@ limitations under the License. #include #include -#include +#include #include -#pragma once -#ifdef __WIN32__ -#ifdef CUDA_ML_EXPORTS -#define CUDA_ML_API __declspec(dllexport) +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) #else -#define CUDA_ML_API __declspec(dllimport) +#define TENSOR_ARRAY_API __declspec(dllimport) #endif #else -#define CUDA_ML_API +#define TENSOR_ARRAY_API #endif +#pragma once + namespace tensor_array { namespace layers { - class CUDA_ML_API LayerImpl + class TENSOR_ARRAY_API LayerImpl { private: bool is_running = false; @@ -67,4 +68,6 @@ namespace tensor_array public CalculateStruct {}; } -} \ No newline at end of file +} + +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/layer_utility.cc b/src/tensor-array/layers/layer_utility.cc similarity index 100% rename from src/tensor_array/layers/layer_utility.cc rename to src/tensor-array/layers/layer_utility.cc diff --git a/src/tensor_array/layers/layer_utility.hh b/src/tensor-array/layers/layer_utility.hh similarity index 66% rename from src/tensor_array/layers/layer_utility.hh rename to src/tensor-array/layers/layer_utility.hh index a0966d9..5b901fd 100644 --- a/src/tensor_array/layers/layer_utility.hh +++ b/src/tensor-array/layers/layer_utility.hh @@ -18,13 +18,23 @@ limitations under the License. 
#pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { typedef value::Tensor(*LayerFunction)(const value::Tensor&); - class CUDA_ML_API ActivationImpl final : + class TENSOR_ARRAY_API ActivationImpl final : public TensorCalculateLayerImpl { public: @@ -34,7 +44,7 @@ namespace tensor_array const LayerFunction func; }; - class CUDA_ML_API ReShapeImpl final : + class TENSOR_ARRAY_API ReShapeImpl final : public TensorCalculateLayerImpl { public: @@ -44,13 +54,15 @@ namespace tensor_array const std::vector shape; }; - value::Tensor CUDA_ML_API NoActivation(const value::Tensor&); - value::Tensor CUDA_ML_API ReLU(const value::Tensor&); - value::Tensor CUDA_ML_API tanh(const value::Tensor&); - value::Tensor CUDA_ML_API Sigmoid(const value::Tensor&); - value::Tensor CUDA_ML_API SoftMax(const value::Tensor&, unsigned char dim); + value::Tensor TENSOR_ARRAY_API NoActivation(const value::Tensor&); + value::Tensor TENSOR_ARRAY_API ReLU(const value::Tensor&); + value::Tensor TENSOR_ARRAY_API tanh(const value::Tensor&); + value::Tensor TENSOR_ARRAY_API Sigmoid(const value::Tensor&); + value::Tensor TENSOR_ARRAY_API SoftMax(const value::Tensor&, unsigned char dim); using Activation = LayerHolder; using ReShape = LayerHolder; } } + +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/linear.cc b/src/tensor-array/layers/linear.cc similarity index 100% rename from src/tensor_array/layers/linear.cc rename to src/tensor-array/layers/linear.cc diff --git a/src/tensor_array/layers/linear.hh b/src/tensor-array/layers/linear.hh similarity index 82% rename from src/tensor_array/layers/linear.hh rename to src/tensor-array/layers/linear.hh index 6d87fdd..45cbc4e 100644 --- a/src/tensor_array/layers/linear.hh +++ b/src/tensor-array/layers/linear.hh @@ -17,11 +17,21 @@ limitations 
under the License. #include "layer_holder.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API LinearImpl final : + class TENSOR_ARRAY_API LinearImpl final : public TensorCalculateLayerImpl { private: @@ -43,4 +53,4 @@ namespace tensor_array } } - +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/normalization.cc b/src/tensor-array/layers/normalization.cc similarity index 100% rename from src/tensor_array/layers/normalization.cc rename to src/tensor-array/layers/normalization.cc diff --git a/src/tensor_array/layers/normalization.hh b/src/tensor-array/layers/normalization.hh similarity index 82% rename from src/tensor_array/layers/normalization.hh rename to src/tensor-array/layers/normalization.hh index 1afa8b6..6d8a8f7 100644 --- a/src/tensor_array/layers/normalization.hh +++ b/src/tensor-array/layers/normalization.hh @@ -17,11 +17,21 @@ limitations under the License. 
#include "layer_holder.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API NormalizationImpl final : + class TENSOR_ARRAY_API NormalizationImpl final : public TensorCalculateLayerImpl { private: @@ -39,3 +49,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/recurrent.cc b/src/tensor-array/layers/recurrent.cc similarity index 100% rename from src/tensor_array/layers/recurrent.cc rename to src/tensor-array/layers/recurrent.cc diff --git a/src/tensor_array/layers/recurrent.hh b/src/tensor-array/layers/recurrent.hh similarity index 86% rename from src/tensor_array/layers/recurrent.hh rename to src/tensor-array/layers/recurrent.hh index 1041e92..721cd2a 100644 --- a/src/tensor_array/layers/recurrent.hh +++ b/src/tensor-array/layers/recurrent.hh @@ -18,11 +18,21 @@ limitations under the License. 
#include "layer_utility.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API RecurrentImpl : + class TENSOR_ARRAY_API RecurrentImpl : public TensorCalculateLayerImpl { private: @@ -41,7 +51,7 @@ namespace tensor_array }; using Recurrent = LayerHolder; - class CUDA_ML_API LSTM_Impl : + class TENSOR_ARRAY_API LSTM_Impl : public TensorCalculateLayerImpl { private: @@ -60,3 +70,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/sequential.cc b/src/tensor-array/layers/sequential.cc similarity index 100% rename from src/tensor_array/layers/sequential.cc rename to src/tensor-array/layers/sequential.cc diff --git a/src/tensor_array/layers/sequential.hh b/src/tensor-array/layers/sequential.hh similarity index 79% rename from src/tensor_array/layers/sequential.hh rename to src/tensor-array/layers/sequential.hh index db7854a..cbe95e0 100644 --- a/src/tensor_array/layers/sequential.hh +++ b/src/tensor-array/layers/sequential.hh @@ -17,13 +17,23 @@ limitations under the License. 
#include "layer_holder.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { using LayerInSequential = LayerHolder; - class CUDA_ML_API SequentialImpl final : + class TENSOR_ARRAY_API SequentialImpl final : public TensorCalculateLayerImpl { private: @@ -37,7 +47,7 @@ namespace tensor_array value::Tensor calculate(const value::Tensor&) override; }; - class CUDA_ML_API Sequential : public LayerHolder + class TENSOR_ARRAY_API Sequential : public LayerHolder { public: Sequential() = default; @@ -46,3 +56,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/layers/transformer.cc b/src/tensor-array/layers/transformer.cc similarity index 100% rename from src/tensor_array/layers/transformer.cc rename to src/tensor-array/layers/transformer.cc diff --git a/src/tensor_array/layers/transformer.hh b/src/tensor-array/layers/transformer.hh similarity index 87% rename from src/tensor_array/layers/transformer.hh rename to src/tensor-array/layers/transformer.hh index c4bac31..a2c8257 100644 --- a/src/tensor_array/layers/transformer.hh +++ b/src/tensor-array/layers/transformer.hh @@ -17,11 +17,21 @@ limitations under the License. 
#include "attention.hh" #pragma once +#ifdef _WIN32 +#ifdef TENSOR_ARRAY_LAYERS_EXPORTS +#define TENSOR_ARRAY_API __declspec(dllexport) +#else +#define TENSOR_ARRAY_API __declspec(dllimport) +#endif +#else +#define TENSOR_ARRAY_API +#endif + namespace tensor_array { namespace layers { - class CUDA_ML_API TransformerEncoderImpl final : + class TENSOR_ARRAY_API TransformerEncoderImpl final : public TensorCalculateLayerImpl { private: @@ -37,7 +47,7 @@ namespace tensor_array using TransformerEncoder = LayerHolder; - class CUDA_ML_API TransformerDecoderImpl final : + class TENSOR_ARRAY_API TransformerDecoderImpl final : public LayerImpl { private: @@ -70,3 +80,4 @@ namespace tensor_array } } +#undef TENSOR_ARRAY_API diff --git a/src/tensor_array/core/CMakeLists.txt b/src/tensor_array/core/CMakeLists.txt deleted file mode 100644 index fe9cede..0000000 --- a/src/tensor_array/core/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -cmake_minimum_required(VERSION 3.8.0) - -enable_language(CUDA C CXX) - -set(CMAKE_CUDA_COMPILER nvcc) -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_ARCHITECTURES 52 75 89) -set(CMAKE_CUDA_SEPARABLE_COMPILATION TRUE) -set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G") # enable cuda-gdb - -file(GLOB TensorArray_src "*.cc" "*.cu") -file(GLOB TensorArray_inc "*.hh") - -# file(MAKE_DIRECTORY "include/tensor_array/core") - -install( - FILES ${TensorArray_inc} - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/tensor_array/core - COMPONENT headers) - -add_library(tensorarray_core SHARED ${TensorArray_src}) -add_library(TensorArray::Core ALIAS tensorarray_core) -target_link_libraries(tensorarray_core PRIVATE cublas) - -install( - TARGETS tensorarray_core - EXPORT TensorArrayTargets - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT libraries - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT libraries) diff --git a/src/tensor_array/core/initializer_wrapper.hh b/src/tensor_array/core/initializer_wrapper.hh deleted file mode 100644 index 7e8e116..0000000 
--- a/src/tensor_array/core/initializer_wrapper.hh +++ /dev/null @@ -1,64 +0,0 @@ -/* -Copyright 2024 TensorArray-Creators - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include - -namespace tensor_array -{ - namespace wrapper - { - template - class initializer_wrapper - { - public: - typedef _E value_type; - typedef const _E& reference; - typedef const _E& const_reference; - typedef size_t size_type; - typedef const _E* iterator; - typedef const _E* const_iterator; - - private: -#ifdef __GNUC__ - iterator _M_array; - size_type _M_len; -#endif - - public: - constexpr initializer_wrapper(const_iterator __a, size_type __l) - : _M_array(__a), _M_len(__l) { } - - constexpr initializer_wrapper(const_iterator __begin, const_iterator __end) - : _M_array(__begin), _M_len(__end - __begin) { } - - constexpr initializer_wrapper() noexcept: _M_array(0), _M_len(0) { } - - // Number of elements. - constexpr size_type - size() const noexcept { return _M_len; } - - // First element. - constexpr const_iterator - begin() const noexcept { return _M_array; } - - // One past the last element. 
- constexpr const_iterator - end() const noexcept { return begin() + size(); } - - constexpr operator std::initializer_list<_E>() const { return reinterpret_cast&>(*this); } - }; - } -} \ No newline at end of file diff --git a/src/tensor_array/layers/CMakeLists.txt b/src/tensor_array/layers/CMakeLists.txt deleted file mode 100644 index cef320d..0000000 --- a/src/tensor_array/layers/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -cmake_minimum_required(VERSION 3.8.0) - -enable_language(C CXX) - -file(GLOB TensorArray_src "*.cc") -file(GLOB TensorArray_inc "*.hh") - -install( - FILES ${TensorArray_inc} - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/tensor_array/layers - COMPONENT headers) - -add_library(tensorarray_layers SHARED ${TensorArray_src}) -add_library(TensorArray::Layers ALIAS tensorarray_layers) - -target_include_directories(tensorarray_layers PRIVATE ${PROJECT_SOURCE_DIR}/src) -target_link_libraries(tensorarray_layers TensorArray::Core) - -install( - TARGETS tensorarray_layers - EXPORT TensorArrayTargets - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT libraries - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT libraries) diff --git a/tests/_ b/tests/_ new file mode 100644 index 0000000..e69de29 diff --git a/tests/tensor-array/core/CMakeLists.txt b/tests/tensor-array/core/CMakeLists.txt new file mode 100644 index 0000000..26cc477 --- /dev/null +++ b/tests/tensor-array/core/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.18) + +# project(TensorArray_tests) + +include(cmake/ta_core_tests.cmake) diff --git a/tests/tensor-array/core/cmake/ta_core_tests.cmake b/tests/tensor-array/core/cmake/ta_core_tests.cmake new file mode 100644 index 0000000..da6401d --- /dev/null +++ b/tests/tensor-array/core/cmake/ta_core_tests.cmake @@ -0,0 +1,25 @@ + +set( + TensorArray_tests_src + "tensor_array_test.cc" + "print_output.cc" + # "tensor_operators.cc" + # "tensor_matmul_transpose.cc" + # "gradient.cc" + ) + +enable_testing() + 
+create_test_sourcelist( + TensorArray_tests + "test_driver.cc" + ${TensorArray_tests_src}) + +add_executable(tensorarray_core_tests ${TensorArray_tests}) +target_include_directories(tensorarray_core_tests PRIVATE ${PROJECT_SOURCE_DIR}/src) +target_link_libraries(tensorarray_core_tests PUBLIC TensorArray::core) + +foreach(test ${TensorArray_tests_src}) + get_filename_component(TName ${test} NAME_WE) + add_test(NAME ${TName} COMMAND tensorarray_core_tests ${TName}) +endforeach() diff --git a/tests/tensor-array/core/gradient.cc b/tests/tensor-array/core/gradient.cc new file mode 100644 index 0000000..fbb5768 --- /dev/null +++ b/tests/tensor-array/core/gradient.cc @@ -0,0 +1,41 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include + +using namespace std; +using namespace tensor_array::value; + +int gradient(int argc, char *argv[]) +{ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3, 4 }}, + {{ 5, 6, 7, 8 }}, + {{ 9, 10, 11, 12 }}, + {{ 13, 14, 15, 16 }}, + }}; + TensorArray example_tensor_array_scalar = {100}; + Tensor example_tensor_1(example_tensor_array); + Tensor example_tensor_2(example_tensor_array_scalar); + Tensor example_tensor_sum = example_tensor_1 + example_tensor_2; + cout << example_tensor_sum << endl; + example_tensor_sum.calc_grad(); + cout << example_tensor_1.get_grad() << endl; + cout << example_tensor_2.get_grad() << endl; + return 0; +} diff --git a/tests/tensor-array/core/print_output.cc b/tests/tensor-array/core/print_output.cc new file mode 100644 index 0000000..ad1c9be --- /dev/null +++ b/tests/tensor-array/core/print_output.cc @@ -0,0 +1,36 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include + +using namespace std; +using namespace tensor_array::value; + +int print_output(int argc, char *argv[]) +{ + /* code */ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3, 4 }}, + {{ 5, 6, 7, 8 }}, + {{ 9, 10, 11, 12 }}, + {{ 13, 14, 15, 16 }}, + }}; + Tensor example_tensor_1(example_tensor_array); + cout << example_tensor_1 << endl; + return 0; +} diff --git a/tests/tensor-array/core/tensor_array_test.cc b/tests/tensor-array/core/tensor_array_test.cc new file mode 100644 index 0000000..039ed01 --- /dev/null +++ b/tests/tensor-array/core/tensor_array_test.cc @@ -0,0 +1,31 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include + +using namespace tensor_array::value; + +int tensor_array_test(int argc, char *argv[]) +{ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3, 4 }}, + {{ 5, 6, 7, 8 }}, + {{ 9, 10, 11, 12 }}, + {{ 13, 14, 15, 16 }}, + }}; + return 0; +} diff --git a/tests/tensor-array/core/tensor_matmul_transpose.cc b/tests/tensor-array/core/tensor_matmul_transpose.cc new file mode 100644 index 0000000..f101971 --- /dev/null +++ b/tests/tensor-array/core/tensor_matmul_transpose.cc @@ -0,0 +1,33 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include + +using namespace tensor_array::value; + +int tensor_matmul_transpose(int argc, char *argv[]) +{ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3 }}, + {{ 4, 5, 6 }} + }}; + TensorArray example_tensor_array_scalar = {100}; + Tensor example_tensor_1(example_tensor_array); + Tensor example_tensor_2 = example_tensor_1.transpose(0, 1); + Tensor example_tensor_add = matmul(example_tensor_1, example_tensor_2); + return 0; +} diff --git a/tests/tensor-array/core/tensor_operators.cc b/tests/tensor-array/core/tensor_operators.cc new file mode 100644 index 0000000..9a840d4 --- /dev/null +++ b/tests/tensor-array/core/tensor_operators.cc @@ -0,0 +1,39 @@ +/* +Copyright 2024 TensorArray-Creators + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include + +using namespace tensor_array::value; + +int tensor_operators(int argc, char *argv[]) +{ + TensorArray example_tensor_array = + {{ + {{ 1, 2, 3, 4 }}, + {{ 5, 6, 7, 8 }}, + {{ 9, 10, 11, 12 }}, + {{ 13, 14, 15, 16 }}, + }}; + TensorArray example_tensor_array_scalar = {100}; + Tensor example_tensor_1(example_tensor_array); + Tensor example_tensor_2(example_tensor_array_scalar); + Tensor example_tensor_add = example_tensor_1 + example_tensor_2; + Tensor example_tensor_sub = example_tensor_1 - example_tensor_2; + Tensor example_tensor_mul = example_tensor_1 * example_tensor_2; + Tensor example_tensor_div = example_tensor_1 + example_tensor_2; + return 0; +} + diff --git a/tests/tensor-array/layers/_ b/tests/tensor-array/layers/_ new file mode 100644 index 0000000..e69de29