diff --git a/.gitignore b/.gitignore index 1dde2d2..e93fb07 100644 --- a/.gitignore +++ b/.gitignore @@ -1,817 +1,817 @@ -# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig - -# Created by https://www.gitignore.io/api/visualstudiocode,windows,c,c++,cmake,codeblocks,intellij+all,jetbrains+all,kdevelop4,linux,macos,osx,pycharm+all,vim,visualstudio,xcode -# Edit at https://www.gitignore.io/?templates=visualstudiocode,windows,c,c++,cmake,codeblocks,intellij+all,jetbrains+all,kdevelop4,linux,macos,osx,pycharm+all,vim,visualstudio,xcode - -### C ### -# Prerequisites -*.d - -# Object files -*.o -*.ko -*.obj -*.elf - -# Linker output -*.ilk -*.map -*.exp - -# Precompiled Headers -*.gch -*.pch - -# Libraries -*.lib -*.a -*.la -*.lo - -# Shared objects (inc. Windows DLLs) -*.dll -*.so -*.so.* -*.dylib - -# Executables -*.exe -*.out -*.app -*.i*86 -*.x86_64 -*.hex - -# Debug files -*.dSYM/ -*.su -*.idb -*.pdb - -# Kernel Module Compile Results -*.mod* -*.cmd -.tmp_versions/ -modules.order -Module.symvers -Mkfile.old -dkms.conf - -### C++ ### -# Prerequisites - -# Compiled Object files -*.slo - -# Precompiled Headers - -# Compiled Dynamic libraries - -# Fortran module files -*.mod -*.smod - -# Compiled Static libraries -*.lai - -# Executables - -### CMake ### -CMakeCache.txt -CMakeFiles -CMakeScripts -Testing -Makefile -cmake_install.cmake -install_manifest.txt -compile_commands.json -CTestTestfile.cmake - -### CodeBlocks ### -# specific to CodeBlocks IDE -*.layout -*.depend -*.cbp -# generated directories -bin/ -obj/ - -### Intellij+all ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Generated files -.idea/**/contentModel.xml - -# Sensitive or 
high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/httpRequests - -# Android studio 3.1+ serialized cache file -.idea/caches/build_file_checksums.ser - -### Intellij+all Patch ### -# Ignores the whole .idea folder and all .iml files -# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 - -.idea/ - -# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 - -*.iml -modules.xml -.idea/misc.xml -*.ipr - -### JetBrains+all ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff - -# Generated files - -# Sensitive or high-churn files - -# Gradle - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. 
-# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake - -# Mongo Explorer plugin - -# File-based project format - -# IntelliJ - -# mpeltonen/sbt-idea plugin - -# JIRA plugin - -# Cursive Clojure plugin - -# Crashlytics plugin (for Android Studio and IntelliJ) - -# Editor-based Rest Client - -# Android studio 3.1+ serialized cache file - -### JetBrains+all Patch ### -# Ignores the whole .idea folder and all .iml files -# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 - - -# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 - -### VSCode ### -.vscode/* -!.vscode/settings.json - -### KDevelop4 ### -*.kdev4 -.kdev4/ - -### Linux ### -*~ - -# temporary files which can be created if a process still has a handle open of a deleted file -.fuse_hidden* - -# KDE directory preferences -.directory - -# Linux trash folder which might appear on any partition or disk -.Trash-* - -# .nfs files are created when an open file is removed but is still being accessed -.nfs* - -### macOS ### -# General -.DS_Store -.AppleDouble -.LSOverride - -# Icon must end with two \r -Icon - -# Thumbnails -._* - -# Files that might appear in the root of a volume -.DocumentRevisions-V100 -.fseventsd -.Spotlight-V100 -.TemporaryItems -.Trashes -.VolumeIcon.icns -.com.apple.timemachine.donotpresent - -# Directories potentially created on remote AFP share -.AppleDB -.AppleDesktop -Network Trash Folder -Temporary Items -.apdisk - -### OSX ### -# General - -# Icon must end with two \r - -# Thumbnails - -# Files that might appear in the root of a volume - -# Directories potentially created on remote AFP share - -### PyCharm+all ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff - -# Generated files - -# Sensitive or high-churn files - -# 
Gradle - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake - -# Mongo Explorer plugin - -# File-based project format - -# IntelliJ - -# mpeltonen/sbt-idea plugin - -# JIRA plugin - -# Cursive Clojure plugin - -# Crashlytics plugin (for Android Studio and IntelliJ) - -# Editor-based Rest Client - -# Android studio 3.1+ serialized cache file - -### PyCharm+all Patch ### -# Ignores the whole .idea folder and all .iml files -# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 - - -# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 - - -### Vim ### -# Swap -[._]*.s[a-v][a-z] -[._]*.sw[a-p] -[._]s[a-rt-v][a-z] -[._]ss[a-gi-z] -[._]sw[a-p] - -# Session -Session.vim - -# Temporary -.netrwhist -# Auto-generated tag files -tags -# Persistent undo -[._]*.un~ - -### VisualStudioCode ### -.vscode/* -!.vscode/settings.json -!.vscode/tasks.json -!.vscode/launch.json -!.vscode/extensions.json - -### Windows ### -# Windows thumbnail cache files -Thumbs.db -ehthumbs.db -ehthumbs_vista.db - -# Dump file -*.stackdump - -# Folder config file -[Dd]esktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Windows Installer files -*.cab -*.msi -*.msix -*.msm -*.msp - -# Windows shortcuts -*.lnk - -### Xcode ### -# Xcode -# -# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore - -## Build generated -build/ -DerivedData/ - -## Various settings -*.pbxuser -!default.pbxuser -*.mode1v3 -!default.mode1v3 -*.mode2v3 -!default.mode2v3 -*.perspectivev3 -!default.perspectivev3 -xcuserdata/ - -## Other -*.moved-aside -*.xccheckout -*.xcscmblueprint - -## Obj-C/Swift specific -*.hmap -*.ipa -*.dSYM.zip -*.dSYM - -## Playgrounds 
-timeline.xctimeline -playground.xcworkspace - -# Swift Package Manager -# -# Add this line if you want to avoid checking in source code from Swift Package Manager dependencies. -# Packages/ -# Package.pins -# Package.resolved -.build/ - -# CocoaPods -# -# We recommend against adding the Pods directory to your .gitignore. However -# you should judge for yourself, the pros and cons are mentioned at: -# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control -# -# Pods/ -# -# Add this line if you want to avoid checking in source code from the Xcode workspace -# *.xcworkspace - -# Carthage -# -# Add this line if you want to avoid checking in source code from Carthage dependencies. -# Carthage/Checkouts - -Carthage/Build - -# fastlane -# -# It is recommended to not store the screenshots in the git repo. Instead, use fastlane to re-generate the -# screenshots whenever they are needed. -# For more information about the recommended setup visit: -# https://docs.fastlane.tools/best-practices/source-control/#source-control - -fastlane/report.xml -fastlane/Preview.html -fastlane/screenshots/**/*.png -fastlane/test_output - -# Code Injection -# -# After new code Injection tools there's a generated folder /iOSInjectionProject -# https://github.com/johnno1962/injectionforxcode - -iOSInjectionProject/ - - -### Xcode Patch ### -*.xcodeproj/* -!*.xcodeproj/project.pbxproj -!*.xcodeproj/xcshareddata/ -!*.xcworkspace/contents.xcworkspacedata -/*.gcno -**/xcshareddata/WorkspaceSettings.xcsettings - -### VisualStudio ### -## Ignore Visual Studio temporary files, build results, and -## files generated by popular Visual Studio add-ons. 
-## -## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore - -# User-specific files -*.rsuser -*.suo -*.user -*.userosscache -*.sln.docstates - -# User-specific files (MonoDevelop/Xamarin Studio) -*.userprefs - -# Build results -[Dd]ebug/ -[Dd]ebugPublic/ -[Rr]elease/ -[Rr]eleases/ -x64/ -x86/ -bld/ -[Bb]in/ -[Oo]bj/ -[Ll]og/ - -# Visual Studio 2015/2017 cache/options directory -.vs/ -# Uncomment if you have tasks that create the project's static files in wwwroot -#wwwroot/ - -# Visual Studio 2017 auto generated files -Generated\ Files/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -# NUNIT -*.VisualState.xml -TestResult.xml - -# Build Results of an ATL Project -[Dd]ebugPS/ -[Rr]eleasePS/ -dlldata.c - -# Benchmark Results -BenchmarkDotNet.Artifacts/ - -# .NET Core -project.lock.json -project.fragment.lock.json -artifacts/ - -# StyleCop -StyleCopReport.xml - -# Files built by Visual Studio -*_i.c -*_p.c -*_h.h -*.meta -*.iobj -*.ipdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*_wpftmp.csproj -*.log -*.vspscc -*.vssscc -.builds -*.pidb -*.svclog -*.scc - -# Chutzpah Test files -_Chutzpah* - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opendb -*.opensdf -*.sdf -*.cachefile -*.VC.db -*.VC.VC.opendb - -# Visual Studio profiler -*.psess -*.vsp -*.vspx -*.sap - -# Visual Studio Trace Files -*.e2e - -# TFS 2012 Local Workspace -$tf/ - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper -*.DotSettings.user - -# JustCode is a .NET coding add-in -.JustCode - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# AxoCover is a Code Coverage Tool -.axoCover/* -!.axoCover/settings.json - -# Visual Studio code coverage results -*.coverage -*.coveragexml - -# NCrunch -_NCrunch_* -.*crunch*.local.xml -nCrunchTemp_* - -# MightyMoose -*.mm.* -AutoTest.Net/ - -# Web workbench (sass) -.sass-cache/ - 
-# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.[Pp]ublish.xml -*.azurePubxml -# Note: Comment the next line if you want to checkin your web deploy settings, -# but database connection strings (with potential passwords) will be unencrypted -*.pubxml -*.publishproj - -# Microsoft Azure Web App publish settings. Comment the next line if you want to -# checkin your Azure Web App publish settings, but sensitive information contained -# in these scripts will be unencrypted -PublishScripts/ - -# NuGet Packages -*.nupkg -# The packages folder can be ignored because of Package Restore -**/[Pp]ackages/* -# except build/, which is used as an MSBuild target. -!**/[Pp]ackages/build/ -# Uncomment if necessary however generally it will be regenerated when needed -#!**/[Pp]ackages/repositories.config -# NuGet v3's project.json files produces more ignorable files -*.nuget.props -*.nuget.targets - -# Microsoft Azure Build Output -csx/ -*.build.csdef - -# Microsoft Azure Emulator -ecf/ -rcf/ - -# Windows Store app package directories and files -AppPackages/ -BundleArtifacts/ -Package.StoreAssociation.xml -_pkginfo.txt -*.appx - -# Visual Studio cache files -# files ending in .cache can be ignored -*.[Cc]ache -# but keep track of directories ending in .cache -!*.[Cc]ache/ - -# Others -ClientBin/ -~$* -*.dbmdl -*.dbproj.schemaview -*.jfm -*.pfx -*.publishsettings -orleans.codegen.cs - -# Including strong name files can present a security risk -# (https://github.com/github/gitignore/pull/2483#issue-259490424) -#*.snk - -# Since there are multiple workflows, uncomment next line to ignore bower_components -# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) -#bower_components/ - 
-# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file -# to a newer Visual Studio version. Backup files are not needed, -# because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm -ServiceFabricBackup/ -*.rptproj.bak - -# SQL Server files -*.mdf -*.ldf -*.ndf - -# Business Intelligence projects -*.rdl.data -*.bim.layout -*.bim_*.settings -*.rptproj.rsuser - -# Microsoft Fakes -FakesAssemblies/ - -# GhostDoc plugin setting file -*.GhostDoc.xml - -# Node.js Tools for Visual Studio -.ntvs_analysis.dat -node_modules/ - -# Visual Studio 6 build log -*.plg - -# Visual Studio 6 workspace options file -*.opt - -# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) -*.vbw - -# Visual Studio LightSwitch build output -**/*.HTMLClient/GeneratedArtifacts -**/*.DesktopClient/GeneratedArtifacts -**/*.DesktopClient/ModelManifest.xml -**/*.Server/GeneratedArtifacts -**/*.Server/ModelManifest.xml -_Pvt_Extensions - -# Paket dependency manager -.paket/paket.exe -paket-files/ - -# FAKE - F# Make -.fake/ - -# JetBrains Rider -*.sln.iml - -# CodeRush personal settings -.cr/personal - -# Python Tools for Visual Studio (PTVS) -__pycache__/ -*.pyc - -# Cake - Uncomment if you are using it -# tools/** -# !tools/packages.config - -# Tabs Studio -*.tss - -# Telerik's JustMock configuration file -*.jmconfig - -# BizTalk build output -*.btp.cs -*.btm.cs -*.odx.cs -*.xsd.cs - -# OpenCover UI analysis results -OpenCover/ - -# Azure Stream Analytics local run output -ASALocalRun/ - -# MSBuild Binary and Structured Log -*.binlog - -# NVidia Nsight GPU debugger configuration file -*.nvuser - -# MFractors (Xamarin productivity tool) working folder -.mfractor/ - -# Local History for Visual Studio -.localhistory/ - -# End of 
https://www.gitignore.io/api/visualstudiocode,windows,c,c++,cmake,codeblocks,intellij+all,jetbrains+all,kdevelop4,linux,macos,osx,pycharm+all,vim,visualstudio,xcode - -# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) - -kendryte-standlone-sdk.si4project/ - -kendryte-standalone-demo -kendryte-standalone-demo-bak -src/ -!src/hello_world -/CMakeSettings.json -/build_i +# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig + +# Created by https://www.gitignore.io/api/visualstudiocode,windows,c,c++,cmake,codeblocks,intellij+all,jetbrains+all,kdevelop4,linux,macos,osx,pycharm+all,vim,visualstudio,xcode +# Edit at https://www.gitignore.io/?templates=visualstudiocode,windows,c,c++,cmake,codeblocks,intellij+all,jetbrains+all,kdevelop4,linux,macos,osx,pycharm+all,vim,visualstudio,xcode + +### C ### +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. 
Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +### C++ ### +# Prerequisites + +# Compiled Object files +*.slo + +# Precompiled Headers + +# Compiled Dynamic libraries + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai + +# Executables + +### CMake ### +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake + +### CodeBlocks ### +# specific to CodeBlocks IDE +*.layout +*.depend +*.cbp +# generated directories +bin/ +obj/ + +### Intellij+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### Intellij+all Patch ### +# Ignores the whole .idea folder and all .iml files +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + +.idea/ + +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + +*.iml +modules.xml +.idea/misc.xml +*.ipr + +### JetBrains+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff + +# Generated files + +# Sensitive or high-churn files + +# Gradle + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake + +# Mongo Explorer plugin + +# File-based project format + +# IntelliJ + +# mpeltonen/sbt-idea plugin + +# JIRA plugin + +# Cursive Clojure plugin + +# Crashlytics plugin (for Android Studio and IntelliJ) + +# Editor-based Rest Client + +# Android studio 3.1+ serialized cache file + +### JetBrains+all Patch ### +# Ignores the whole .idea folder and all .iml files +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + + +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + +### VSCode ### +.vscode/* +!.vscode/settings.json + +### KDevelop4 ### +*.kdev4 +.kdev4/ + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### OSX ### +# General + +# Icon must end with two \r + +# Thumbnails + +# Files that might appear in the root of a volume + +# Directories potentially created on remote AFP share + +### PyCharm+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff + +# Generated files + +# Sensitive or high-churn files + +# 
Gradle + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake + +# Mongo Explorer plugin + +# File-based project format + +# IntelliJ + +# mpeltonen/sbt-idea plugin + +# JIRA plugin + +# Cursive Clojure plugin + +# Crashlytics plugin (for Android Studio and IntelliJ) + +# Editor-based Rest Client + +# Android studio 3.1+ serialized cache file + +### PyCharm+all Patch ### +# Ignores the whole .idea folder and all .iml files +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + + +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +### Xcode ### +# Xcode +# +# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore + +## Build generated +build/ +DerivedData/ + +## Various settings +*.pbxuser +!default.pbxuser +*.mode1v3 +!default.mode1v3 +*.mode2v3 +!default.mode2v3 +*.perspectivev3 +!default.perspectivev3 +xcuserdata/ + +## Other +*.moved-aside +*.xccheckout +*.xcscmblueprint + +## Obj-C/Swift specific +*.hmap +*.ipa +*.dSYM.zip +*.dSYM + +## Playgrounds 
+timeline.xctimeline +playground.xcworkspace + +# Swift Package Manager +# +# Add this line if you want to avoid checking in source code from Swift Package Manager dependencies. +# Packages/ +# Package.pins +# Package.resolved +.build/ + +# CocoaPods +# +# We recommend against adding the Pods directory to your .gitignore. However +# you should judge for yourself, the pros and cons are mentioned at: +# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control +# +# Pods/ +# +# Add this line if you want to avoid checking in source code from the Xcode workspace +# *.xcworkspace + +# Carthage +# +# Add this line if you want to avoid checking in source code from Carthage dependencies. +# Carthage/Checkouts + +Carthage/Build + +# fastlane +# +# It is recommended to not store the screenshots in the git repo. Instead, use fastlane to re-generate the +# screenshots whenever they are needed. +# For more information about the recommended setup visit: +# https://docs.fastlane.tools/best-practices/source-control/#source-control + +fastlane/report.xml +fastlane/Preview.html +fastlane/screenshots/**/*.png +fastlane/test_output + +# Code Injection +# +# After new code Injection tools there's a generated folder /iOSInjectionProject +# https://github.com/johnno1962/injectionforxcode + +iOSInjectionProject/ + + +### Xcode Patch ### +*.xcodeproj/* +!*.xcodeproj/project.pbxproj +!*.xcodeproj/xcshareddata/ +!*.xcworkspace/contents.xcworkspacedata +/*.gcno +**/xcshareddata/WorkspaceSettings.xcsettings + +### VisualStudio ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. 
+## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.meta +*.iobj +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding add-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + 
+# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + 
+# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# JetBrains Rider +*.sln.iml + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# End of 
https://www.gitignore.io/api/visualstudiocode,windows,c,c++,cmake,codeblocks,intellij+all,jetbrains+all,kdevelop4,linux,macos,osx,pycharm+all,vim,visualstudio,xcode + +# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) + +kendryte-standlone-sdk.si4project/ + +kendryte-standalone-demo +kendryte-standalone-demo-bak +src/ +!src/hello_world +/CMakeSettings.json +/build_i diff --git a/cmake/common.cmake b/cmake/common.cmake index cfac06f..a30602e 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -22,6 +22,11 @@ ENDIF () # definitions in macros add_definitions(-DCONFIG_LOG_LEVEL=LOG_VERBOSE -DCONFIG_LOG_ENABLE -DCONFIG_LOG_COLORS -DLOG_KERNEL -D__riscv64 -DLV_CONF_INCLUDE_SIMPLE) +# xtl options +add_definitions(-DTCB_SPAN_NO_EXCEPTIONS -DTCB_SPAN_NO_CONTRACT_CHECKING) +# nncase options +add_definitions(-DNNCASE_TARGET=k210) + if (NOT SDK_ROOT) get_filename_component(_SDK_ROOT ${CMAKE_CURRENT_LIST_DIR} DIRECTORY) global_set(SDK_ROOT ${_SDK_ROOT}) diff --git a/cmake/compile-flags.cmake b/cmake/compile-flags.cmake index 603943e..0e7ae55 100644 --- a/cmake/compile-flags.cmake +++ b/cmake/compile-flags.cmake @@ -40,6 +40,7 @@ if (BUILDING_SDK) -Wno-error=unused-but-set-variable -Wno-error=unused-variable -Wno-error=deprecated-declarations + -Wno-multichar -Wextra -Werror=frame-larger-than=32768 -Wno-unused-parameter diff --git a/lds/kendryte.ld b/lds/kendryte.ld index 38e9ede..94ae150 100644 --- a/lds/kendryte.ld +++ b/lds/kendryte.ld @@ -103,7 +103,7 @@ SECTIONS { PROVIDE_HIDDEN (__init_array_start = .); KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) - KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)) + *(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors) PROVIDE_HIDDEN (__init_array_end = .); } >ram AT>ram :ram_ro diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index c2f8a81..c2a2e97 100644 
/* Per-model runtime state filled in by kpu_load_kmodel(). */
typedef struct
{
    /* Non-zero when the model is handled by the nncase runtime. The kpu_*
     * entry points in kpu.c test this flag first and forward to the matching
     * nncase_* function; kpu_load_kmodel() sets it to 0 on the version-3 path. */
    int is_nncase;

    union
    {
        /* State of a legacy (header version 3) kmodel; valid when is_nncase == 0. */
        struct
        {
            const uint8_t *model_buffer;
            uint8_t *main_buffer; /* malloc'd by kpu_load_kmodel(), released by kpu_model_free() */
            uint32_t output_count;
            const kpu_model_output_t *outputs;
            const kpu_model_layer_header_t *layer_headers;
            const uint8_t *body_start;
            uint32_t layers_length;
            volatile uint32_t current_layer;
            const uint8_t *volatile current_body;
            dmac_channel_number_t dma_ch;
            kpu_done_callback_t done_callback;
            void *userdata;
        };

        /* State of an nncase-backed model; valid when is_nncase != 0. */
        struct
        {
            /* NOTE(review): presumably an opaque handle owned by the nncase
             * runtime - confirm against nncase.h. */
            void* nncase_ctx;
        };
    };
} kpu_model_context_t;
namespace nncase
{
/// Element types a runtime buffer can hold.
typedef enum _datatype
{
    dt_float32,
    dt_uint8
} datatype_t;

/// Padding on the leading (`before`) and trailing (`after`) side of one axis.
struct padding
{
    int32_t before;
    int32_t after;

    /// Total number of elements added to the axis.
    int32_t sum() const noexcept { return before + after; }

    /// Value-initialised padding (both sides zero).
    static padding zero() noexcept { return {}; }
};

/// A pair of bounds of type T.
template <class T>
struct value_range
{
    T min;
    T max;
};

typedef enum _reduce_op
{
    reduce_mean,
    reduce_min,
    reduce_max
} reduce_op_t;

typedef enum _binary_op
{
    binary_add,
    binary_sub,
    binary_mul,
    binary_div
} binary_op_t;

/// Affine quantization parameters.
typedef struct _quant_param
{
    int32_t zero_point;
    float scale;
} quant_param_t;

/// Exact member-wise comparison; note that `scale` is compared with `==`.
inline bool operator==(const quant_param_t &lhs, const quant_param_t &rhs) noexcept
{
    return lhs.zero_point == rhs.zero_point && lhs.scale == rhs.scale;
}

struct fixed_mul
{
    float mul;
    int8_t shift;
};

typedef enum _memory_type
{
    mem_const,
    mem_main,
    mem_k210_kpu
} memory_type_t;

// The kernels index shapes and paddings with [0]..[3] and brace-initialise
// shapes from four int32_t values, hence four elements each.
// NOTE(review): element type `int` reconstructed from usage - confirm.
using runtime_shape_t = std::array<int, 4>;
using runtime_paddings_t = std::array<padding, 4>;

/// A single value stored as raw bytes together with its datatype tag.
struct scalar
{
    datatype_t type;
    // NOTE(review): size reconstructed as 4 bytes (enough for dt_float32 and
    // dt_uint8, the only datatypes declared above) - confirm.
    std::array<uint8_t, 4> storage;

    scalar() = default;

    /// Stores `value` into `storage`. `type` is NOT set by this constructor.
    /// Because T is deduced from a forwarding reference, this only compiles
    /// for rvalue arguments (an lvalue would deduce a reference type).
    template <class T>
    scalar(T &&value) { as<T>() = value; }

    /// Reinterprets the stored bytes as a T.
    template <class T>
    T &as() noexcept { return *reinterpret_cast<T *>(storage.data()); }

    template <class T>
    const T &as() const noexcept { return *reinterpret_cast<const T *>(storage.data()); }
};

/// A typed slice of one of the runtime memory pools.
struct memory_range
{
    memory_type_t memory_type;
    datatype_t datatype;
    uint32_t start;
    uint32_t size;
};
}
b/lib/nncase/include/kernels/cpu/cpu_kernels.h @@ -0,0 +1,257 @@ +#pragma once +#include "../utils.h" +#include + +namespace nncase +{ +namespace kernels +{ + namespace cpu + { + inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape, + int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, + const padding &padding_h, const padding &padding_w, const value_range &fused_activation) + { + const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h); + const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w); + + for (int batch = 0; batch < in_shape[0]; batch++) + { + auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3]; + + for (int oy = 0; oy < out_h; oy++) + { + for (int ox = 0; ox < out_w; ox++) + { + int in_y_origin = (oy * stride_h) - padding_h.before; + int in_x_origin = (ox * stride_w) - padding_w.before; + int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h); + int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h); + int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w); + int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w); + + for (int oc = 0; oc < out_channels; oc++) + { + auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3]; + float value = bias[oc]; + + for (int ky = filter_y_start; ky < filter_y_end; ky++) + { + for (int kx = filter_xSstart; kx < filter_x_end; kx++) + { + int in_y = in_y_origin + dilation_h * ky; + int in_x = in_x_origin + dilation_w * kx; + + auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3]; + auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3]; + + for (int ic = 0; ic < 
in_shape[3]; ic++) + value += in_pix[ic] * w_pix[ic]; + } + } + + *output++ = details::apply_activation(value, fused_activation); + } + } + } + } + } + + inline void depthwise_conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape, + int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, + const padding &padding_h, const padding &padding_w, const value_range &fused_activation) + { + const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h); + const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w); + + for (int batch = 0; batch < in_shape[0]; batch++) + { + auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3]; + + for (int oy = 0; oy < out_h; oy++) + { + for (int ox = 0; ox < out_w; ox++) + { + int in_y_origin = (oy * stride_h) - padding_h.before; + int in_x_origin = (ox * stride_w) - padding_w.before; + int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h); + int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h); + int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w); + int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w); + + for (int oc = 0; oc < in_shape[3]; oc++) + { + auto w_oc = weights + (size_t)oc * filter_h * filter_w; + float value = bias[oc]; + + for (int ky = filter_y_start; ky < filter_y_end; ky++) + { + for (int kx = filter_xSstart; kx < filter_x_end; kx++) + { + int in_y = in_y_origin + dilation_h * ky; + int in_x = in_x_origin + dilation_w * kx; + + auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3]; + auto w_pix = w_oc + ((size_t)ky * filter_w + kx); + + value += in_pix[oc] * w_pix[0]; + } + } + + *output++ = details::apply_activation(value, 
fused_activation); + } + } + } + } + } + + template + void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, + int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, + const padding &padding_h, const padding &padding_w, const value_range &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op) + { + const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h); + const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w); + + for (int batch = 0; batch < in_shape[0]; batch++) + { + auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3]; + + for (int oy = 0; oy < out_h; oy++) + { + for (int ox = 0; ox < out_w; ox++) + { + int in_y_origin = (oy * stride_h) - padding_h.before; + int in_x_origin = (ox * stride_w) - padding_w.before; + int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h); + int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h); + int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w); + int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w); + + for (int oc = 0; oc < in_shape[3]; oc++) + { + float value = init_value; + int32_t kernel_count = 0; + + for (int ky = filter_y_start; ky < filter_y_end; ky++) + { + for (int kx = filter_xSstart; kx < filter_x_end; kx++) + { + int in_y = in_y_origin + dilation_h * ky; + int in_x = in_x_origin + dilation_w * kx; + + auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3]; + + value = binary_op(value, in_pix[oc]); + kernel_count++; + } + } + + *output++ = details::apply_activation(window_op(value, kernel_count), fused_activation); + } + } + } + } + } + + inline void quantized_conv2d(const uint8_t *input, uint8_t *output, const 
uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape, + int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, + const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset) + { + const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h); + const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w); + + for (int batch = 0; batch < in_shape[0]; batch++) + { + auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3]; + + for (int oy = 0; oy < out_h; oy++) + { + for (int ox = 0; ox < out_w; ox++) + { + int in_y_origin = (oy * stride_h) - padding_h.before; + int in_x_origin = (ox * stride_w) - padding_w.before; + int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h); + int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h); + int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w); + int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w); + + for (int oc = 0; oc < out_channels; oc++) + { + auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3]; + int32_t value = bias[oc]; + + for (int ky = filter_y_start; ky < filter_y_end; ky++) + { + for (int kx = filter_xSstart; kx < filter_x_end; kx++) + { + int in_y = in_y_origin + dilation_h * ky; + int in_x = in_x_origin + dilation_w * kx; + + auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3]; + auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3]; + + for (int ic = 0; ic < in_shape[3]; ic++) + value += (in_pix[ic] - input_offset) * (w_pix[ic] - filter_offset); + } + } + + value = runtime::mul_and_carry_shift(value, output_mul, 
output_shift) + output_offset; + *output++ = (uint8_t)std::clamp(value, 0, 255); + } + } + } + } + } + + inline void quantized_depthwise_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape, + int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, + const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset) + { + const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h); + const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w); + + for (int batch = 0; batch < in_shape[0]; batch++) + { + auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3]; + + for (int oy = 0; oy < out_h; oy++) + { + for (int ox = 0; ox < out_w; ox++) + { + int in_y_origin = (oy * stride_h) - padding_h.before; + int in_x_origin = (ox * stride_w) - padding_w.before; + int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h); + int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h); + int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w); + int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w); + + for (int oc = 0; oc < in_shape[3]; oc++) + { + auto w_oc = weights + (size_t)oc * filter_h * filter_w; + int32_t value = bias[oc]; + + for (int ky = filter_y_start; ky < filter_y_end; ky++) + { + for (int kx = filter_xSstart; kx < filter_x_end; kx++) + { + int in_y = in_y_origin + dilation_h * ky; + int in_x = in_x_origin + dilation_w * kx; + + auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3]; + auto w_pix = w_oc + ((size_t)ky * filter_w + kx); + + value += (in_pix[oc] - input_offset) * (w_pix[0] 
- filter_offset); + } + } + + value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset; + *output++ = (uint8_t)std::clamp(value, 0, 255); + } + } + } + } + } + } +} +} diff --git a/lib/nncase/include/kernels/k210/k210_kernels.h b/lib/nncase/include/kernels/k210/k210_kernels.h new file mode 100644 index 0000000..2782b6f --- /dev/null +++ b/lib/nncase/include/kernels/k210/k210_kernels.h @@ -0,0 +1,256 @@ +#pragma once +#include "../utils.h" +#include +#include + +namespace nncase +{ +namespace kernels +{ + namespace k210 + { + inline void kpu_upload(const uint8_t *src, uint8_t *dest, const runtime_shape_t &in_shape) + { + if (in_shape[3] % 64 == 0) + { + std::copy(src, src + kernels::details::compute_size(in_shape), dest); + } + else + { + auto layout = targets::k210::get_kpu_row_layout(in_shape[3]); + auto fmap_size = targets::k210::get_kpu_bytes(in_shape[3], in_shape[2], in_shape[1]); + + for (int32_t batch = 0; batch < in_shape[0]; batch++) + { + auto batch_origin = dest + (size_t)batch * fmap_size; + for (int32_t oc = 0; oc < in_shape[1]; oc++) + { + auto channel_origin = batch_origin + (size_t)oc / layout.groups * layout.row_len * in_shape[2] * 64 + (size_t)oc % layout.groups * layout.row_pitch; + for (int32_t y = 0; y < in_shape[2]; y++) + { + auto y_origin = channel_origin + (size_t)y * layout.row_len * 64; + std::copy(src, src + in_shape[3], y_origin); + src += in_shape[3]; + } + } + } + } + } + +#if NNCASE_TARGET_K210_SIMULATOR + + inline void kpu_download(const uint8_t *src, uint8_t *dest, const runtime_shape_t &in_shape) + { + if (in_shape[3] % 64 == 0) + { + std::copy(src, src + kernels::details::compute_size(in_shape), dest); + } + else + { + auto layout = targets::k210::get_kpu_row_layout(in_shape[3]); + auto fmap_size = targets::k210::get_kpu_bytes(in_shape[3], in_shape[2], in_shape[1]); + + for (int32_t batch = 0; batch < in_shape[0]; batch++) + { + auto batch_origin = src + (size_t)batch * fmap_size; + for (int32_t oc = 
0; oc < in_shape[1]; oc++) + { + auto channel_origin = batch_origin + (size_t)oc / layout.groups * layout.row_len * in_shape[2] * 64 + (size_t)oc % layout.groups * layout.row_pitch; + for (int32_t y = 0; y < in_shape[2]; y++) + { + auto y_origin = channel_origin + (size_t)y * layout.row_len * 64; + for (int32_t x = 0; x < in_shape[3]; x++) + *dest++ = y_origin[x]; + } + } + } + } + } + + template + void kpu_conv2d(const uint8_t *input, int64_t *workspace, uint8_t *output, const uint8_t *weights, int32_t in_h, int32_t in_w, int32_t in_channels, int32_t out_channels, uint8_t pad_value, int32_t arg_x, + int32_t shift_x, int32_t arg_w, int32_t shift_w, int64_t arg_add, const targets::k210::kpu_batchnorm_segment *batchnorm, const targets::k210::kpu_activation_table_t &activation) + { + const auto channel_size = size_t(in_h) * in_w; + // conv + { + auto out_it = workspace; + const auto pad = FilterSize == 1 ? 0 : 1; + const auto groups = IsDepthwise ? out_channels : 1; + const auto g_ic = IsDepthwise ? 1 : in_channels / groups; + const auto g_oc = IsDepthwise ? 
1 : out_channels; + + for (int32_t og = 0; og < groups; og++) + { + const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * FilterSize * FilterSize; + + for (int32_t oc = 0; oc < g_oc; oc++) + { + const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * FilterSize * FilterSize; + + for (int32_t oy = 0; oy < in_h; oy++) + { + for (int32_t ox = 0; ox < in_w; ox++) + { + const int32_t in_y_origin = oy - pad; + const int32_t in_x_origin = ox - pad; + int64_t value = 0; + int64_t sum_x = 0, sum_w = 0; + + for (int32_t ic = 0; ic < g_ic; ic++) + { + const uint8_t *in_c_p = input + ((size_t)og * g_ic + ic) * in_h * in_w; + const uint8_t *w_ic_p = w_oc_p + (size_t)ic * FilterSize * FilterSize; + + for (int32_t ky = 0; ky < FilterSize; ky++) + { + for (int32_t kx = 0; kx < FilterSize; kx++) + { + const int32_t in_y = in_y_origin + ky; + const int32_t in_x = in_x_origin + kx; + + uint8_t x; + if (in_x < 0 || in_x >= in_w + || in_y < 0 || in_y >= in_h) + x = pad_value; + else + x = in_c_p[in_y * in_w + in_x]; + + uint8_t w = w_ic_p[ky * FilterSize + kx]; + + sum_x += x; + sum_w += w; + value += (int32_t)x * w; + } + } + } + + *out_it++ = value + (arg_x * sum_x >> shift_x) + (arg_w * sum_w >> shift_w) + arg_add * g_ic; + } + } + } + } + } + + // bn act + { + auto src_it = workspace; + auto out_it = output; + for (int32_t oc = 0; oc < out_channels; oc++) + { + const auto &bn = batchnorm[oc]; + for (size_t i = 0; i < channel_size; i++) + { + auto value = (*src_it++ * bn.mul >> bn.shift) + bn.add; + auto &seg = *std::find_if(activation.rbegin(), activation.rend(), [value](const targets::k210::kpu_activation_segment &seg) { + return value > seg.start_x; + }); + value = runtime::carry_shift((value - seg.start_x) * seg.mul, seg.shift); + *out_it++ = (uint8_t)std::clamp(value, int64_t(0), int64_t(255)); + } + } + } + } + + inline void kpu_pool2d(const uint8_t *input, uint8_t *output, int32_t in_h, int32_t in_w, int32_t in_channels, targets::k210::kpu_pool_type_t pool_type) + { 
+ using namespace targets::k210; + + const auto filter = get_kpu_filter_size(pool_type); + const auto stride = get_kpu_filter_stride(pool_type); + const auto out_h = get_kpu_pool_output_size(in_h, pool_type); + const auto out_w = get_kpu_pool_output_size(in_w, pool_type); + + for (int32_t oc = 0; oc < in_channels; oc++) + { + auto in_c_p = input + (size_t)oc * in_h * in_w; + + for (int32_t oy = 0; oy < out_h; oy++) + { + for (int32_t ox = 0; ox < out_w; ox++) + { + const int32_t in_y_origin = oy * stride; + const int32_t in_x_origin = ox * stride; + int32_t value = 0; + + switch (pool_type) + { + case kpu_pool_bypass: + { + const int32_t in_y = in_y_origin; + const int32_t in_x = in_x_origin; + + value = in_c_p[in_y * in_w + in_x]; + break; + } + case kpu_pool_max_2_s2: + case kpu_pool_max_2_s1: + case kpu_pool_max_4_s4: + { + for (int32_t ky = 0; ky < filter; ky++) + { + for (int32_t kx = 0; kx < filter; kx++) + { + const int32_t in_y = in_y_origin + ky; + const int32_t in_x = in_x_origin + kx; + int32_t in_v; + + if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w) + in_v = 0; + else + in_v = in_c_p[in_y * in_w + in_x]; + + value = std::max(value, in_v); + } + } + + break; + } + case kpu_pool_mean_2_s2: + case kpu_pool_mean_2_s1: + case kpu_pool_mean_4_s4: + { + for (int32_t ky = 0; ky < filter; ky++) + { + for (int32_t kx = 0; kx < filter; kx++) + { + const int32_t in_y = std::clamp(in_y_origin + ky, 0, in_h - 1); + const int32_t in_x = std::clamp(in_x_origin + kx, 0, in_w - 1); + const int32_t in_v = in_c_p[in_y * in_w + in_x]; + + value += in_v; + } + } + + value /= filter * filter; + break; + } + case kpu_pool_left_top_2_s2: + case kpu_pool_left_top_4_s4: + case kpu_pool_right_top_2_s2: + { + auto k_off = get_kpu_select_pool_offset(pool_type); + const int32_t in_y = in_y_origin + k_off[0]; + const int32_t in_x = in_x_origin + k_off[1]; + int32_t in_v; + + if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w) + in_v = 0; + else + in_v = in_c_p[in_y * 
in_w + in_x]; + + value = in_v; + break; + } + } + + *output++ = (uint8_t)value; + } + } + } + } + +#endif + } +} +} diff --git a/lib/nncase/include/kernels/neutral/neutral_kernels.h b/lib/nncase/include/kernels/neutral/neutral_kernels.h new file mode 100644 index 0000000..c4d372e --- /dev/null +++ b/lib/nncase/include/kernels/neutral/neutral_kernels.h @@ -0,0 +1,422 @@ +#pragma once +#include "../utils.h" +#include +#include +#include + +namespace nncase +{ +namespace kernels +{ + namespace neutral + { + template + void binary(const float *input_a, const float *input_b, float *output, const runtime_shape_t &in_a_shape, + const runtime_shape_t &in_b_shape, const runtime_shape_t &out_shape, const value_range &fused_activation, TOp &&op) + { + for (int32_t d0 = 0; d0 < out_shape[0]; d0++) + { + for (int32_t d1 = 0; d1 < out_shape[1]; d1++) + { + for (int32_t d2 = 0; d2 < out_shape[2]; d2++) + { + for (int32_t d3 = 0; d3 < out_shape[3]; d3++) + { + runtime_shape_t in_off = { d0, d1, d2, d3 }; + const auto in_a_off = kernels::details::get_reduced_offset(in_off, in_a_shape); + const auto in_b_off = kernels::details::get_reduced_offset(in_off, in_b_shape); + const auto a = input_a[offset(in_a_shape, in_a_off)]; + const auto b = input_b[offset(in_b_shape, in_b_off)]; + + output[offset(out_shape, in_off)] = kernels::details::apply_activation(op(a, b), fused_activation); + } + } + } + } + } + + template > + inline void concat(xtl::span inputs, uint8_t *output, xtl::span concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {}) + { + for (size_t oc = 0; oc < outer_size; oc++) + { + for (size_t i = 0; i < inputs.size(); i++) + { + auto size = inner_size * concat_dims[i]; + auto src = getter(inputs[i]) + oc * size; + std::copy(src, src + size, output); + output += size; + } + } + } + + inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape, + int32_t groups, int32_t out_channels, int32_t 
filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, + const padding &padding_h, const padding &padding_w, const value_range &fused_activation) + { + const auto out_h = details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h); + const auto out_w = details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w); + const auto g_ic = in_shape[1] / groups; + const auto g_oc = out_channels / groups; + + for (int32_t batch = 0; batch < in_shape[0]; batch++) + { + const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3]; + + for (int32_t og = 0; og < groups; og++) + { + const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3]; + const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w; + + for (int32_t oc = 0; oc < g_oc; oc++) + { + const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w; + + for (int32_t oy = 0; oy < out_h; oy++) + { + for (int32_t ox = 0; ox < out_w; ox++) + { + const int32_t in_y_origin = (oy * stride_h) - padding_h.before; + const int32_t in_x_origin = (ox * stride_w) - padding_w.before; + const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h); + const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h); + const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w); + const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w); + float value = bias[oc]; + + for (int32_t ic = 0; ic < g_ic; ic++) + { + const float *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3]; + const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w; + + for (int32_t ky = filter_y_start; ky < filter_y_end; ky++) + { + for (int32_t kx = filter_x_start; kx < filter_x_end; kx++) + { + const int32_t 
in_y = in_y_origin + dilation_h * ky; + const int32_t in_x = in_x_origin + dilation_w * kx; + + const float in_v = in_c_p[in_y * in_shape[3] + in_x]; + const float w = w_ic_p[ky * filter_w + kx]; + + value += in_v * w; + } + } + } + + *output++ = details::apply_activation(value, fused_activation); + } + } + } + } + } + } + + template + void dequantize(const TQ *input, float *output, size_t count, const quant_param_t ¶m) + { + float div = 1.f / param.scale; + + for (size_t i = 0; i < count; i++) + { + output[i] = (input[i] - param.zero_point) * div; + } + } + + inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range &fused_activation) + { + for (size_t oy = 0; oy < a_rows; oy++) + { + for (size_t ox = 0; ox < b_cols; ox++) + { + float value = bias[ox]; + for (size_t i = 0; i < a_cols; i++) + { + const auto a = input_a[oy * a_cols + i]; + const auto b = input_b[i * b_cols + ox]; + value += a * b; + } + + output[oy * b_cols + ox] = details::apply_activation(value, fused_activation); + } + } + } + + template + void pad(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_paddings_t &paddings, T pad_value) + { + runtime_shape_t out_shape = { in_shape[0] + paddings[0].sum(), + in_shape[1] + paddings[1].sum(), + in_shape[2] + paddings[2].sum(), + in_shape[3] + paddings[3].sum() }; + + for (int d0 = 0; d0 < out_shape[0]; d0++) + { + auto d0_origin = -paddings[0].before; + auto in0 = input + ((size_t)d0_origin + d0) * in_shape[1] * in_shape[2] * in_shape[3]; + + for (int d1 = 0; d1 < out_shape[1]; d1++) + { + auto d1_origin = -paddings[1].before; + auto in1 = in0 + ((size_t)d1_origin + d1) * in_shape[2] * in_shape[3]; + + for (int d2 = 0; d2 < out_shape[2]; d2++) + { + auto d2_origin = -paddings[2].before; + auto in2 = in1 + ((size_t)d2_origin + d2) * in_shape[3]; + + for (int d3 = 0; d3 < out_shape[3]; d3++) + { + auto d3_origin = 
-paddings[3].before; + + if (d0 < paddings[0].before || d0 >= out_shape[0] - paddings[0].after + || d1 < paddings[1].before || d1 >= out_shape[1] - paddings[1].after + || d2 < paddings[2].before || d2 >= out_shape[2] - paddings[2].after + || d3 < paddings[3].before || d1 >= out_shape[3] - paddings[3].after) + *output++ = pad_value; + else + *output++ = in2[d3_origin + d3]; + } + } + } + } + } + + template + void quantize(const float *input, TQ *output, size_t count, const quant_param_t ¶m) + { + for (size_t i = 0; i < count; i++) + { + int32_t tmp = (int32_t)roundf(input[i] * param.scale + param.zero_point); + output[i] = std::clamp(tmp, (int32_t)std::numeric_limits::lowest(), (int32_t)std::numeric_limits::max()); + } + } + + template + void reduce(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, const runtime_shape_t &reduced_shape, TReducer &&reducer) + { + std::fill(output, output + kernels::details::compute_size(reduced_shape), init_value); + + for (int32_t d0 = 0; d0 < in_shape[0]; d0++) + { + for (int32_t d1 = 0; d1 < in_shape[1]; d1++) + { + for (int32_t d2 = 0; d2 < in_shape[2]; d2++) + { + for (int32_t d3 = 0; d3 < in_shape[3]; d3++) + { + runtime_shape_t in_off = { d0, d1, d2, d3 }; + auto out_off = kernels::details::get_reduced_offset(in_off, reduced_shape); + const auto a = input[offset(in_shape, in_off)]; + auto &b = output[offset(reduced_shape, out_off)]; + b = reducer(b, a); + } + } + } + } + } + + template + void unary(const float *input, float *output, size_t count, TOp &&op) + { + for (size_t i = 0; i < count; i++) + output[i] = op(input[i]); + } + + template + void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, int32_t filter_h, int32_t filter_w, + int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, const padding &padding_h, const padding &padding_w, + const value_range &fused_activation, TBinaryOp &&binary_op, TOutputOp 
&&window_op) + { + const auto out_h = kernels::details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h); + const auto out_w = kernels::details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w); + runtime_shape_t out_shape { in_shape[0], in_shape[1], out_h, out_w }; + + for (int32_t batch = 0; batch < in_shape[0]; batch++) + { + for (int32_t oc = 0; oc < in_shape[1]; oc++) + { + for (int32_t oy = 0; oy < out_h; oy++) + { + for (int32_t ox = 0; ox < out_w; ox++) + { + const int32_t in_y_origin = (oy * stride_h) - padding_h.before; + const int32_t in_x_origin = (ox * stride_w) - padding_w.before; + const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h); + const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h); + const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w); + const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w); + float value = init_value; + int32_t kernel_count = 0; + + for (int32_t ky = filter_y_start; ky < filter_y_end; ky++) + { + for (int32_t kx = filter_x_start; kx < filter_x_end; kx++) + { + const int32_t in_y = in_y_origin + dilation_h * ky; + const int32_t in_x = in_x_origin + dilation_w * kx; + + const float in_v = input[offset(in_shape, { batch, oc, in_y, in_x })]; + + value = binary_op(value, in_v); + kernel_count++; + } + } + + output[offset(out_shape, { batch, oc, oy, ox })] = kernels::details::apply_activation(window_op(value, kernel_count), fused_activation); + } + } + } + } + } + + template + void resize_nearest_neighbor(const T *input, T *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w) + { + auto height_scale = (float)in_shape[2] / out_h; + auto width_scale = (float)in_shape[3] / out_w; + + for (int batch = 0; batch < in_shape[0]; batch++) + { + auto in_batch = input + batch * 
in_shape[1] * in_shape[2] * in_shape[3]; + + for (int oc = 0; oc < in_shape[1]; oc++) + { + auto in_c = in_batch + oc * in_shape[2] * in_shape[3]; + + for (int oy = 0; oy < out_h; oy++) + { + auto in_y = std::min((int32_t)floorf(oy * height_scale), in_shape[2] - 1); + auto in_row = in_c + in_y * in_shape[3]; + + for (int ox = 0; ox < out_w; ox++) + { + auto in_x = std::min((int32_t)floorf(ox * width_scale), in_shape[3] - 1); + *output++ = in_row[in_x]; + } + } + } + } + } + + inline void resize_bilinear(const float *input, float *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w, bool align_corners) + { + auto height_scale = (float)in_shape[2] / out_h; + auto width_scale = (float)in_shape[3] / out_w; + if (align_corners && out_h > 1) + height_scale = (float)(in_shape[2] - 1) / (out_h - 1); + if (align_corners && out_w > 1) + width_scale = (float)(in_shape[3] - 1) / (out_w - 1); + + auto destIdx = 0; + for (int batch = 0; batch < in_shape[0]; batch++) + { + auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3]; + + for (int oc = 0; oc < in_shape[1]; oc++) + { + auto in_c = in_batch + (size_t)oc * in_shape[2] * in_shape[3]; + + for (int oy = 0; oy < out_h; oy++) + { + auto in_y = oy * height_scale; + auto in_y0 = (int)floorf(in_y); + auto in_y1 = std::min(in_y0 + 1, in_shape[2] - 1); + + for (int ox = 0; ox < out_w; ox++) + { + auto in_x = ox * width_scale; + auto in_x0 = (int)floorf(in_x); + auto in_x1 = std::min(in_x0 + 1, in_shape[3] - 1); + + auto v0 = in_c[in_y0 * in_shape[3] + in_x0]; + auto v1 = in_c[in_y1 * in_shape[3] + in_x0]; + auto v2 = in_c[in_y0 * in_shape[3] + in_x1]; + auto v3 = in_c[in_y1 * in_shape[3] + in_x1]; + + auto a0 = (1 - (in_y - in_y0)) * (1 - (in_x - in_x0)); + auto a1 = (in_y - in_y0) * (1 - (in_x - in_x0)); + auto a2 = (1 - (in_y - in_y0)) * (in_x - in_x0); + auto a3 = (in_y - in_y0) * (in_x - in_x0); + + output[destIdx++] = v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3; + } + } + } + } + } + + 
inline void softmax(const float *input, float *output, float beta, int32_t outer_size, size_t inner_size) + { + for (size_t batch = 0; batch < outer_size; batch++) + { + auto src = input + batch * inner_size; + auto dest = output + batch * inner_size; + + auto max = *std::max_element(src, src + inner_size); + float sum = 0; + + for (size_t i = 0; i < inner_size; i++) + { + auto value = expf((src[i] - max) * beta); + sum += value; + dest[i] = value; + } + + for (size_t i = 0; i < inner_size; i++) + dest[i] /= sum; + } + } + + template + void transpose(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &perm) + { + runtime_shape_t out_shape; + for (size_t i = 0; i < 4; i++) + out_shape[i] = in_shape[perm[i]]; + + runtime_shape_t i, o; + for (o[3] = 0; o[3] < out_shape[3]; o[3]++) + { + i[perm[3]] = o[3]; + for (o[2] = 0; o[2] < out_shape[2]; o[2]++) + { + i[perm[2]] = o[2]; + for (o[1] = 0; o[1] < out_shape[1]; o[1]++) + { + i[perm[1]] = o[1]; + for (o[0] = 0; o[0] < out_shape[0]; o[0]++) + { + i[perm[0]] = o[0]; + output[offset(out_shape, o)] = input[offset(in_shape, i)]; + } + } + } + } + } + + template + void strided_slice(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &begin, const runtime_shape_t &end, const runtime_shape_t &strides) + { + auto loop_cond = [](int32_t i, int32_t stop, int32_t stride) { + return stride > 0 ? 
i < stop : i > stop; + }; + + for (int32_t d0 = begin[0]; loop_cond(d0, end[0], strides[0]); d0 += strides[0]) + { + auto d0_origin = input + (size_t)d0 * in_shape[1] * in_shape[2] * in_shape[3]; + for (int d1 = begin[1]; loop_cond(d1, end[1], strides[1]); d1 += strides[1]) + { + auto d1_origin = d0_origin + (size_t)d1 * in_shape[2] * in_shape[3]; + for (int32_t d2 = begin[2]; loop_cond(d2, end[2], strides[2]); d2 += strides[2]) + { + auto d2_origin = d1_origin + (size_t)d2 * in_shape[3]; + for (int32_t d3 = begin[3]; loop_cond(d3, end[3], strides[3]); d3 += strides[3]) + *output++ = d2_origin[d3]; + } + } + } + } + } +} +} diff --git a/lib/nncase/include/kernels/utils.h b/lib/nncase/include/kernels/utils.h new file mode 100644 index 0000000..62d717f --- /dev/null +++ b/lib/nncase/include/kernels/utils.h @@ -0,0 +1,82 @@ +#pragma once +#include +#include +#include +#include + +namespace nncase +{ +namespace kernels +{ + inline size_t offset(const runtime_shape_t &shape, const runtime_shape_t &index) + { + return (((size_t)index[0] * shape[1] + index[1]) * shape[2] + index[2]) * shape[3] + index[3]; + } + + namespace details + { + inline int32_t get_windowed_output_size(int32_t size, int32_t filter, int32_t stride, int32_t dilation, const padding &padding) + { + auto effective_filter_size = (filter - 1) * dilation + 1; + return (size + padding.before + padding.after - effective_filter_size + stride) / stride; + } + + inline size_t compute_size(const runtime_shape_t &shape) + { + return size_t(shape[0]) * shape[1] * shape[2] * shape[3]; + } + + template + inline T apply_activation(T value, value_range activation) + { + return std::clamp(value, activation.min, activation.max); + } + + inline runtime_shape_t get_reduced_offset(const runtime_shape_t &in_offset, const runtime_shape_t &reduced_shape) + { + runtime_shape_t off; + for (size_t i = 0; i < in_offset.size(); i++) + { + if (in_offset[i] >= reduced_shape[i]) + off[i] = 0; + else + off[i] = in_offset[i]; + } + + 
return off; + } + + template + struct default_ptr_getter + { + T *operator()(const TRange &range) const noexcept { return range; } + }; + + template + int32_t to_signed(uint32_t value) + { + auto mask = uint32_t(1) << (Bits - 1); + if (Bits != 32 && (value & mask) != 0) + { + auto sign = 0xFFFFFFFF << Bits; + return (int)(value | sign); + } + + return (int32_t)value; + } + + template + int64_t to_signed(uint64_t value) + { + auto mask = uint64_t(1) << (Bits - 1); + if ((value & mask) != 0) + { + auto sign = 0xFFFFFFFFFFFFFFFF << Bits; + return (int64_t)(value | sign); + } + + return (int64_t)value; + } + } +} +} diff --git a/lib/nncase/include/nncase.h b/lib/nncase/include/nncase.h new file mode 100644 index 0000000..28666e1 --- /dev/null +++ b/lib/nncase/include/nncase.h @@ -0,0 +1,33 @@ +/* Copyright 2018 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef _NNCASE_H +#define _NNCASE_H + +#include "kpu.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer); +int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size); +void nncase_model_free(kpu_model_context_t *ctx); +int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/lib/nncase/include/runtime/binary_writer.h b/lib/nncase/include/runtime/binary_writer.h new file mode 100644 index 0000000..1af2d8f --- /dev/null +++ b/lib/nncase/include/runtime/binary_writer.h @@ -0,0 +1,51 @@ +#pragma once +#include +#include + +namespace nncase +{ +namespace runtime +{ + class binary_writer + { + public: + binary_writer(std::ostream &stream) + : stream_(stream) + { + } + + template + void write(T &&value) + { + stream_.write(reinterpret_cast(&value), sizeof(value)); + } + + template + void write_array(xtl::span value) + { + stream_.write(reinterpret_cast(value.data()), value.size_bytes()); + } + + std::streampos position() const + { + return stream_.tellp(); + } + + void position(std::streampos pos) + { + stream_.seekp(pos); + } + + void align_position(size_t alignment) + { + auto pos = position(); + auto rem = pos % alignment; + if (rem != 0) + position(pos + std::streamoff(alignment - rem)); + } + + private: + std::ostream &stream_; + }; +} +} diff --git a/lib/nncase/include/runtime/interpreter.h b/lib/nncase/include/runtime/interpreter.h new file mode 100644 index 0000000..20e827b --- /dev/null +++ b/lib/nncase/include/runtime/interpreter.h @@ -0,0 +1,71 @@ +#pragma once +#include "model.h" +#include +#include +#include +#include + +namespace nncase +{ +namespace runtime +{ + class interpreter_base; + typedef void (*run_callback_t)(void *userdata); + typedef void 
(*error_callback_t)(const char *err, void *userdata); + typedef void (*node_profile_callback_t)(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata); + typedef void (interpreter_base::*interpreter_step_t)(); + + class interpreter_base + { + using clock_t = std::chrono::system_clock; + + public: + bool try_load_model(const uint8_t *buffer); + + size_t inputs_size() const noexcept { return model_header_->inputs; } + size_t outputs_size() const noexcept { return model_header_->outputs; } + size_t nodes_size() const noexcept { return model_header_->nodes; } + + const runtime_shape_t &input_shape_at(size_t index) const noexcept { return input_shapes_.at(index); } + const memory_range &input_at(size_t index) const noexcept { return inputs_[index]; } + const memory_range &output_at(size_t index) const noexcept { return outputs_[index]; } + + template + xtl::span memory_at(const memory_range &range) const noexcept + { + auto span = memory_at(range); + return { reinterpret_cast(span.data()), span.size() / sizeof(T) }; + } + + std::chrono::nanoseconds total_duration() const noexcept { return total_duration_; } + + void run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata); + + protected: + virtual bool initialize(); + virtual xtl::span memory_at(const memory_range &range) const noexcept; + + private: + void step(); + + private: + const model_header *model_header_; + std::unique_ptr main_mem_; + xtl::span inputs_; + xtl::span outputs_; + xtl::span input_shapes_; + xtl::span node_headers_; + xtl::span constants_; + const uint8_t *node_body_start_; + error_callback_t on_error_; + run_callback_t run_callback_; + node_profile_callback_t node_profile_; + void *userdata_; + size_t cnt_node_; + const uint8_t *cnt_node_body_; + std::chrono::nanoseconds total_duration_; + std::optional last_time_; + runtime_opcode last_op_; + }; +} +} diff --git a/lib/nncase/include/runtime/kernel_registry.h 
b/lib/nncase/include/runtime/kernel_registry.h new file mode 100644 index 0000000..0dc1ddd --- /dev/null +++ b/lib/nncase/include/runtime/kernel_registry.h @@ -0,0 +1,20 @@ +#pragma once +#include "target_config.h" +#include +#include +#include + +namespace nncase +{ +namespace runtime +{ + enum kernel_call_result + { + kcr_done, + kcr_async, + kcr_error + }; + + kernel_call_result call_kernel(runtime_opcode opcode, xtl::span body, interpreter_t &interpreter, interpreter_step_t step); +} +} diff --git a/lib/nncase/include/runtime/model.h b/lib/nncase/include/runtime/model.h new file mode 100644 index 0000000..b597618 --- /dev/null +++ b/lib/nncase/include/runtime/model.h @@ -0,0 +1,38 @@ +#pragma once +#include "../datatypes.h" +#include "runtime_op.h" + +namespace nncase +{ +namespace runtime +{ + enum model_target : uint32_t + { + MODEL_TARGET_CPU = 0, + MODEL_TARGET_K210 = 1, + }; + + struct model_header + { + uint32_t identifier; + uint32_t version; + uint32_t flags; + model_target target; + uint32_t constants; + uint32_t main_mem; + uint32_t nodes; + uint32_t inputs; + uint32_t outputs; + uint32_t reserved0; + }; + + constexpr uint32_t MODEL_IDENTIFIER = 'KMDL'; + constexpr uint32_t MODEL_VERSION = 4; + + struct node_header + { + runtime_opcode opcode; + uint32_t body_size; + }; +} +} diff --git a/lib/nncase/include/runtime/runtime_op.def b/lib/nncase/include/runtime/runtime_op.def new file mode 100644 index 0000000..f18dd35 --- /dev/null +++ b/lib/nncase/include/runtime/runtime_op.def @@ -0,0 +1,32 @@ +BEGINE_DEFINE_TARGET(neutral) + DEFINE_RUNTIME_OP(neutral, binary, Binary, 0) + DEFINE_RUNTIME_OP(neutral, concat, Concat, 1) + DEFINE_RUNTIME_OP(neutral, conv2d, Conv2D, 2) + DEFINE_RUNTIME_OP(neutral, dequantize, Dequantize, 3) + DEFINE_RUNTIME_OP(neutral, matmul, MatMul, 4) + DEFINE_RUNTIME_OP(neutral, pad, Pad, 5) + DEFINE_RUNTIME_OP(neutral, quantize, Quantize, 6) + DEFINE_RUNTIME_OP(neutral, reduce, Reduce, 7) + DEFINE_RUNTIME_OP(neutral, reduce_window2d, 
ReduceWindow2D, 8) + DEFINE_RUNTIME_OP(neutral, memory_copy, MemoryCopy, 9) + DEFINE_RUNTIME_OP(neutral, resize_bilinear, ResizeBilinear, 10) + DEFINE_RUNTIME_OP(neutral, resize_nearest_neighbor, ResizeNearestNeighbor, 11) + DEFINE_RUNTIME_OP(neutral, softmax, Softmax, 12) + DEFINE_RUNTIME_OP(neutral, transpose, Transpose, 13) + DEFINE_RUNTIME_OP(neutral, strided_slice, StridedSlice, 14) +END_DEFINE_TARGET() + +// CPU +BEGINE_DEFINE_TARGET(cpu) + DEFINE_RUNTIME_OP(cpu, cpu_conv2d, CPU_CPUConv2D, 1001) + DEFINE_RUNTIME_OP(cpu, cpu_depthwise_conv2d, CPU_CPUDepthwiseConv2D, 1002) + DEFINE_RUNTIME_OP(cpu, cpu_reduce_window2d, CPU_CPUReduceWindow2D, 1003) + DEFINE_RUNTIME_OP(cpu, cpu_quantized_conv2d, CPU_CPUQuantizedConv2D, 1004) + DEFINE_RUNTIME_OP(cpu, cpu_quantized_depthwise_conv2d, CPU_CPUQuantizedDepthwiseConv2D, 1005) +END_DEFINE_TARGET() + +// K210 +BEGINE_DEFINE_TARGET(k210) + DEFINE_RUNTIME_OP(k210, kpu_upload, K210_KPUUpload, 2001) + DEFINE_RUNTIME_OP(k210, kpu_conv2d, K210_KPUConv2D, 2002) +END_DEFINE_TARGET() diff --git a/lib/nncase/include/runtime/runtime_op.h b/lib/nncase/include/runtime/runtime_op.h new file mode 100644 index 0000000..d927ba9 --- /dev/null +++ b/lib/nncase/include/runtime/runtime_op.h @@ -0,0 +1,37 @@ +#pragma once +#include "../datatypes.h" +#include + +namespace nncase +{ +namespace runtime +{ +#define BEGINE_DEFINE_TARGET(...) 
+#define DEFINE_RUNTIME_OP(target, id, name, value) rop_##id = value, +#define END_DEFINE_TARGET() + + enum runtime_opcode : uint32_t + { +#include "runtime_op.def" + }; + +#undef DEFINE_RUNTIME_OP +#define DEFINE_RUNTIME_OP(target, id, name, value) \ + case rop_##id: \ + return #name; + + constexpr std::string_view node_opcode_names(runtime_opcode opcode) + { + switch (opcode) + { +#include "runtime_op.def" + default: + return {}; + } + } + +#undef BEGINE_DEFINE_TARGET +#undef DEFINE_RUNTIME_OP +#undef END_DEFINE_TARGET +} +} diff --git a/lib/nncase/include/runtime/span_reader.h b/lib/nncase/include/runtime/span_reader.h new file mode 100644 index 0000000..c60d009 --- /dev/null +++ b/lib/nncase/include/runtime/span_reader.h @@ -0,0 +1,82 @@ +#pragma once +#include + +namespace nncase +{ +namespace runtime +{ + class span_reader + { + public: + span_reader(xtl::span span) + : span_(span) + { + } + + bool empty() const noexcept { return span_.empty(); } + + template + T read() + { + auto value = *reinterpret_cast(span_.data()); + advance(sizeof(T)); + return value; + } + + template + void read(T &value) + { + value = *reinterpret_cast(span_.data()); + advance(sizeof(T)); + } + + template + void read_span(xtl::span &span, size_t size) + { + span = { reinterpret_cast(span_.data()), size }; + advance(sizeof(T) * size); + } + + template + void read_span(xtl::span &span) + { + span = { reinterpret_cast(span_.data()), N }; + advance(sizeof(T) * N); + } + + template + const T *peek() const noexcept + { + return reinterpret_cast(span_.data()); + } + + template + void get_array(const T *&value, size_t size) + { + value = peek(); + advance(size * sizeof(T)); + } + + template + void get_ref(const T *&value) + { + value = peek(); + advance(sizeof(T)); + } + + void skip(size_t count) + { + advance(count); + } + + private: + void advance(size_t count) + { + span_ = span_.subspan(count); + } + + private: + xtl::span span_; + }; +} +} diff --git 
a/lib/nncase/include/runtime/target_config.h b/lib/nncase/include/runtime/target_config.h new file mode 100644 index 0000000..381e5de --- /dev/null +++ b/lib/nncase/include/runtime/target_config.h @@ -0,0 +1,15 @@ +#pragma once + +#define NNCASE_CONCAT_3(a, b, c) a/b/c +#define NNCASE_TARGET_HEADER_(target, name) +#define NNCASE_TARGET_HEADER(name) NNCASE_TARGET_HEADER_(NNCASE_TARGET, name) + +#include NNCASE_TARGET_HEADER(interpreter.h) + +namespace nncase +{ +namespace runtime +{ + using interpreter_t = nncase::targets::NNCASE_TARGET::interpreter; +} +} diff --git a/lib/nncase/include/runtime_op_utility.h b/lib/nncase/include/runtime_op_utility.h new file mode 100644 index 0000000..78fab1c --- /dev/null +++ b/lib/nncase/include/runtime_op_utility.h @@ -0,0 +1,70 @@ +#pragma once +#include +#include + +namespace nncase +{ +namespace runtime +{ + inline size_t get_bytes(datatype_t type) + { + size_t element_size; + + switch (type) + { + case dt_float32: + element_size = 4; + break; + case dt_uint8: + element_size = 1; + break; + default: + assert(!"Not supported data type"); + } + + return element_size; + } + + template + uint8_t count_leading_zeros(T value) + { + uint8_t num_zeroes = 0; + for (int32_t i = Bits - 1; i >= 0; i--) + { + if ((value & (1ULL << i)) == 0) + ++num_zeroes; + else + break; + } + + return num_zeroes; + } + + template + T carry_shift(T value, uint8_t shift) + { + if (shift > 0) + { + value >>= shift - 1; + if (value & 0x1) + { + if (value < 0) + value = (value >> 1) - 1; + else + value = (value >> 1) + 1; + } + else + { + value >>= 1; + } + } + + return value; + } + + inline int32_t mul_and_carry_shift(int32_t value, int32_t mul, uint8_t shift) + { + return (int32_t)carry_shift((int64_t) value * mul, shift); + } +} +} diff --git a/lib/nncase/include/targets/cpu/cpu_ops_body.h b/lib/nncase/include/targets/cpu/cpu_ops_body.h new file mode 100644 index 0000000..afdb8e1 --- /dev/null +++ b/lib/nncase/include/targets/cpu/cpu_ops_body.h @@ -0,0 
+1,193 @@ +#pragma once +#include "../node_body.h" + +namespace nncase +{ +namespace targets +{ + namespace cpu + { + struct cpu_conv2d_options + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + int32_t out_channels; + padding padding_h; + padding padding_w; + int32_t filter_h; + int32_t filter_w; + int32_t stride_h; + int32_t stride_w; + int32_t dilation_h; + int32_t dilation_w; + value_range fused_activation; + xtl::span weights; + xtl::span bias; + + void deserialize(runtime::span_reader &reader) + { + reader.read(input); + reader.read(output); + reader.read(in_shape); + reader.read(out_channels); + reader.read(padding_h); + reader.read(padding_w); + reader.read(filter_h); + reader.read(filter_w); + reader.read(stride_h); + reader.read(stride_w); + reader.read(dilation_h); + reader.read(dilation_w); + reader.read(fused_activation); + reader.read_span(weights, (size_t)out_channels * in_shape[3] * filter_h * filter_w); + reader.read_span(bias, out_channels); + } + }; + + struct cpu_depthwise_conv2d_options + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + padding padding_h; + padding padding_w; + int32_t filter_h; + int32_t filter_w; + int32_t stride_h; + int32_t stride_w; + int32_t dilation_h; + int32_t dilation_w; + value_range fused_activation; + xtl::span weights; + xtl::span bias; + + void deserialize(runtime::span_reader &reader) + { + reader.read(input); + reader.read(output); + reader.read(in_shape); + reader.read(padding_h); + reader.read(padding_w); + reader.read(filter_h); + reader.read(filter_w); + reader.read(stride_h); + reader.read(stride_w); + reader.read(dilation_h); + reader.read(dilation_w); + reader.read(fused_activation); + reader.read_span(weights, (size_t)in_shape[3] * filter_h * filter_w); + reader.read_span(bias, in_shape[3]); + } + }; + + struct cpu_reduce_window2d_options : simple_node_body + { + memory_range input; + memory_range output; + reduce_op_t reduce_op; + runtime_shape_t 
in_shape; + padding padding_h; + padding padding_w; + int32_t filter_h; + int32_t filter_w; + int32_t stride_h; + int32_t stride_w; + int32_t dilation_h; + int32_t dilation_w; + float init_value; + value_range fused_activation; + }; + + struct cpu_quantized_conv2d_options + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + int32_t out_channels; + padding padding_h; + padding padding_w; + int32_t filter_h; + int32_t filter_w; + int32_t stride_h; + int32_t stride_w; + int32_t dilation_h; + int32_t dilation_w; + int32_t input_offset; + int32_t filter_offset; + int32_t output_mul; + int32_t output_shift; + int32_t output_offset; + xtl::span weights; + xtl::span bias; + + void deserialize(runtime::span_reader &reader) + { + reader.read(input); + reader.read(output); + reader.read(in_shape); + reader.read(out_channels); + reader.read(padding_h); + reader.read(padding_w); + reader.read(filter_h); + reader.read(filter_w); + reader.read(stride_h); + reader.read(stride_w); + reader.read(dilation_h); + reader.read(dilation_w); + reader.read(input_offset); + reader.read(filter_offset); + reader.read(output_mul); + reader.read(output_shift); + reader.read(output_offset); + reader.read_span(weights, (size_t)out_channels * in_shape[3] * filter_h * filter_w); + reader.read_span(bias, out_channels); + } + }; + + struct cpu_quantized_depthwise_conv2d_options + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + padding padding_h; + padding padding_w; + int32_t filter_h; + int32_t filter_w; + int32_t stride_h; + int32_t stride_w; + int32_t dilation_h; + int32_t dilation_w; + int32_t input_offset; + int32_t filter_offset; + int32_t output_mul; + int32_t output_shift; + int32_t output_offset; + xtl::span weights; + xtl::span bias; + + void deserialize(runtime::span_reader &reader) + { + reader.read(input); + reader.read(output); + reader.read(in_shape); + reader.read(padding_h); + reader.read(padding_w); + reader.read(filter_h); + 
reader.read(filter_w); + reader.read(stride_h); + reader.read(stride_w); + reader.read(dilation_h); + reader.read(dilation_w); + reader.read(input_offset); + reader.read(filter_offset); + reader.read(output_mul); + reader.read(output_shift); + reader.read(output_offset); + reader.read_span(weights, (size_t)in_shape[3] * filter_h * filter_w); + reader.read_span(bias, in_shape[3]); + } + }; + } +} +} diff --git a/lib/nncase/include/targets/cpu/interpreter.h b/lib/nncase/include/targets/cpu/interpreter.h new file mode 100644 index 0000000..3289770 --- /dev/null +++ b/lib/nncase/include/targets/cpu/interpreter.h @@ -0,0 +1,17 @@ +#pragma once +#include + +namespace nncase +{ +namespace targets +{ + namespace cpu + { + class interpreter : public runtime::interpreter_base + { + public: + using interpreter_base::interpreter_base; + }; + } +} +} diff --git a/lib/nncase/include/targets/k210/interpreter.h b/lib/nncase/include/targets/k210/interpreter.h new file mode 100644 index 0000000..3dc1d0e --- /dev/null +++ b/lib/nncase/include/targets/k210/interpreter.h @@ -0,0 +1,44 @@ +#pragma once +#include "k210_sim_types.h" +#include + +namespace nncase +{ +namespace targets +{ + namespace k210 + { + struct k210_interpreter_context + { + runtime::interpreter_base *interpreter; + runtime::interpreter_step_t step; + }; + + class interpreter : public runtime::interpreter_base + { + public: + using interpreter_base::memory_at; + + interpreter(); + +#if !NNCASE_TARGET_K210_SIMULATOR + + dmac_channel_number_t dma_ch() const noexcept { return dma_ch_; } + void dma_ch(dmac_channel_number_t dma_ch) noexcept { dma_ch_ = dma_ch; } + k210_interpreter_context &context() noexcept { return context_; } +#endif + + protected: + xtl::span memory_at(const memory_range &range) const noexcept override; + + private: +#if NNCASE_TARGET_K210_SIMULATOR + std::unique_ptr kpu_mem_; +#else + dmac_channel_number_t dma_ch_; + k210_interpreter_context context_; +#endif + }; + } +} +} diff --git 
a/lib/nncase/include/targets/k210/k210_ops_body.h b/lib/nncase/include/targets/k210/k210_ops_body.h new file mode 100644 index 0000000..d32b2b5 --- /dev/null +++ b/lib/nncase/include/targets/k210/k210_ops_body.h @@ -0,0 +1,58 @@ +#pragma once +#include "../node_body.h" +#include "k210_runtime_op_utility.h" +#include "k210_sim_types.h" + +namespace nncase +{ +namespace targets +{ + namespace k210 + { + struct kpu_upload_options : simple_node_body + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + }; + + struct kpu_conv2d_options + { + memory_range main_mem_output; + int32_t batches; + int32_t reserved0; + kpu_layer_argument_t layer; + xtl::span batch_norm; + const kpu_activate_table_t *activation; + xtl::span weights; + + void deserialize(runtime::span_reader &reader) + { + reader.read(main_mem_output); + reader.read(batches); + reader.read(reserved0); + reader.read(layer); + + auto ic = layer.image_channel_num.data.i_ch_num + 1; + auto oc = layer.image_channel_num.data.o_ch_num + 1; + auto filter = get_kpu_filter_size((kpu_filter_type_t)layer.kernel_pool_type_cfg.data.kernel_type); + auto weights_size = layer.interrupt_enabe.data.depth_wise_layer + ? 
oc * filter * filter + : ic * oc * filter * filter; + + reader.skip(layer.kernel_pool_type_cfg.data.bwsx_base_addr); + reader.read_span(batch_norm, oc); + reader.skip(layer.kernel_calc_type_cfg.data.active_addr); + reader.get_ref(activation); + reader.skip(layer.kernel_load_cfg.data.para_start_addr); + reader.read_span(weights, weights_size); +#if !NNCASE_TARGET_K210_SIMULATOR + layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data(); + layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation; + layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data(); +#endif + } + }; + } +} +} diff --git a/lib/nncase/include/targets/k210/k210_runtime_op_utility.h b/lib/nncase/include/targets/k210/k210_runtime_op_utility.h new file mode 100644 index 0000000..6b5df2e --- /dev/null +++ b/lib/nncase/include/targets/k210/k210_runtime_op_utility.h @@ -0,0 +1,134 @@ +#pragma once +#include "k210_sim_types.h" + +namespace nncase +{ +namespace targets +{ + namespace k210 + { + struct kpu_layout + { + int32_t groups; + int32_t row_len; + int32_t row_pitch; + }; + + inline kpu_layout get_kpu_row_layout(int32_t width) + { + kpu_layout layout; + + if (width <= 16) + { + layout.groups = 4; + layout.row_len = 1; + layout.row_pitch = 16; + } + else if (width <= 32) + { + layout.groups = 2; + layout.row_len = 1; + layout.row_pitch = 32; + } + else + { + layout.groups = 1; + layout.row_len = (width + 63) / 64; + layout.row_pitch = 64; + } + + return layout; + } + + inline int32_t get_kpu_filter_size(kpu_filter_type_t filter) + { + switch (filter) + { + case kpu_filter_1x1: + return 1; + case kpu_filter_3x3: + return 3; + default: + return 0; + } + } + + inline int get_kpu_rows(int32_t width, int32_t height, int32_t channels) + { + auto layout = get_kpu_row_layout(width); + auto one_line_channels = std::min(channels, layout.groups); + auto blocks = (channels + one_line_channels - 1) / one_line_channels; + auto size = layout.row_len * height * blocks; + 
return size; + } + + inline int get_kpu_bytes(int32_t width, int32_t height, int32_t channels) + { + return get_kpu_rows(width, height, channels) * 64; + } + +#if NNCASE_TARGET_K210_SIMULATOR + + inline int32_t get_kpu_filter_size(kpu_pool_type_t filter) + { + switch (filter) + { + case kpu_pool_bypass: + return 1; + case kpu_pool_max_2_s2: + case kpu_pool_mean_2_s2: + case kpu_pool_left_top_2_s2: + case kpu_pool_right_top_2_s2: + case kpu_pool_max_2_s1: + case kpu_pool_mean_2_s1: + return 2; + case kpu_pool_max_4_s4: + case kpu_pool_mean_4_s4: + case kpu_pool_left_top_4_s4: + return 4; + } + } + + inline int32_t get_kpu_filter_stride(kpu_pool_type_t filter) + { + switch (filter) + { + case kpu_pool_bypass: + return 1; + case kpu_pool_max_2_s2: + case kpu_pool_mean_2_s2: + case kpu_pool_left_top_2_s2: + case kpu_pool_right_top_2_s2: + return 2; + case kpu_pool_max_2_s1: + case kpu_pool_mean_2_s1: + return 1; + case kpu_pool_max_4_s4: + case kpu_pool_mean_4_s4: + case kpu_pool_left_top_4_s4: + return 4; + } + } + + inline int32_t get_kpu_pool_output_size(int32_t input, kpu_pool_type_t pool_type) + { + return input / get_kpu_filter_stride(pool_type); + } + + inline std::array get_kpu_select_pool_offset(kpu_pool_type_t pool_type) + { + switch (pool_type) + { + case kpu_pool_left_top_2_s2: + return { 0, 0 }; + case kpu_pool_right_top_2_s2: + return { 0, 1 }; + case kpu_pool_left_top_4_s4: + return { 0, 0 }; + } + } + +#endif + } +} +} diff --git a/lib/nncase/include/targets/k210/k210_sim_types.h b/lib/nncase/include/targets/k210/k210_sim_types.h new file mode 100644 index 0000000..17398ec --- /dev/null +++ b/lib/nncase/include/targets/k210/k210_sim_types.h @@ -0,0 +1,249 @@ +#pragma once +#include +#include + +#ifdef __riscv64 +#define NNCASE_TARGET_K210_SIMULATOR 0 +#include +#else +#define NNCASE_TARGET_K210_SIMULATOR 1 +#endif + +namespace nncase +{ +namespace targets +{ + namespace k210 + { +#if NNCASE_TARGET_K210_SIMULATOR + typedef struct + { + union { + uint64_t 
reg; + struct + { + uint64_t int_en : 1; + uint64_t ram_flag : 1; + uint64_t full_add : 1; + uint64_t depth_wise_layer : 1; + uint64_t reserved : 60; + } data; + } interrupt_enabe; + + union { + uint64_t reg; + struct + { + uint64_t image_src_addr : 15; + uint64_t reserved0 : 17; + uint64_t image_dst_addr : 15; + uint64_t reserved1 : 17; + } data; + } image_addr; + + union { + uint64_t reg; + struct + { + uint64_t i_ch_num : 10; + uint64_t reserved0 : 22; + uint64_t o_ch_num : 10; + uint64_t reserved1 : 6; + uint64_t o_ch_num_coef : 10; + uint64_t reserved2 : 6; + } data; + } image_channel_num; + + union { + uint64_t reg; + struct + { + uint64_t i_row_wid : 10; + uint64_t i_col_high : 9; + uint64_t reserved0 : 13; + uint64_t o_row_wid : 10; + uint64_t o_col_high : 9; + uint64_t reserved1 : 13; + } data; + } image_size; + + union { + uint64_t reg; + struct + { + uint64_t kernel_type : 3; + uint64_t pad_type : 1; + uint64_t pool_type : 4; + uint64_t first_stride : 1; + uint64_t bypass_conv : 1; + uint64_t load_para : 1; + uint64_t reserved0 : 5; + uint64_t dma_burst_size : 8; + uint64_t pad_value : 8; + uint64_t bwsx_base_addr : 32; + } data; + } kernel_pool_type_cfg; + + union { + uint64_t reg; + struct + { + uint64_t load_coor : 1; + uint64_t load_time : 6; + uint64_t reserved0 : 8; + uint64_t para_size : 17; + uint64_t para_start_addr : 32; + } data; + } kernel_load_cfg; + + union { + uint64_t reg; + struct + { + uint64_t coef_column_offset : 4; + uint64_t coef_row_offset : 12; + uint64_t reserved0 : 48; + } data; + } kernel_offset; + + union { + uint64_t reg; + struct + { + uint64_t channel_switch_addr : 15; + uint64_t reserved : 1; + uint64_t row_switch_addr : 4; + uint64_t coef_size : 8; + uint64_t coef_group : 3; + uint64_t load_act : 1; + uint64_t active_addr : 32; + } data; + } kernel_calc_type_cfg; + + union { + uint64_t reg; + struct + { + uint64_t wb_channel_switch_addr : 15; + uint64_t reserved0 : 1; + uint64_t wb_row_switch_addr : 4; + uint64_t wb_group 
: 3; + uint64_t reserved1 : 41; + } data; + } write_back_cfg; + + union { + uint64_t reg; + struct + { + uint64_t shr_w : 4; + uint64_t shr_x : 4; + uint64_t arg_w : 24; + uint64_t arg_x : 24; + uint64_t reserved0 : 8; + } data; + } conv_value; + + union { + uint64_t reg; + struct + { + uint64_t arg_add : 40; + uint64_t reserved : 24; + } data; + } conv_value2; + + union { + uint64_t reg; + struct + { + uint64_t send_data_out : 1; + uint64_t reserved : 15; + uint64_t channel_byte_num : 16; + uint64_t dma_total_byte : 32; + } data; + } dma_parameter; + } kpu_layer_argument_t; + + typedef struct + { + union { + uint64_t reg; + struct + { + uint64_t shift_number : 8; + uint64_t y_mul : 16; + uint64_t x_start : 36; + } data; + } activate_para[16]; + + union { + uint64_t reg; + struct + { + uint8_t result_bias[8]; + } data; + } activate_para_bias0; + + union { + uint64_t reg; + struct + { + uint8_t result_bias[8]; + } data; + } activate_para_bias1; + } kpu_activate_table_t; +#endif + + typedef struct + { + union { + uint64_t reg; + struct + { + uint64_t norm_mul : 24; + uint64_t norm_add : 32; + uint64_t norm_shift : 4; + } data; + } batchnorm; + } kpu_batchnorm_argument_t; + + typedef enum _kpu_filter_type + { + kpu_filter_1x1 = 0, + kpu_filter_3x3 = 1 + } kpu_filter_type_t; + + typedef enum _kpu_pool_type + { + kpu_pool_bypass = 0, + kpu_pool_max_2_s2 = 1, + kpu_pool_mean_2_s2 = 2, + kpu_pool_max_4_s4 = 3, + kpu_pool_mean_4_s4 = 4, + kpu_pool_left_top_2_s2 = 5, + kpu_pool_right_top_2_s2 = 6, + kpu_pool_left_top_4_s4 = 7, + kpu_pool_mean_2_s1 = 8, + kpu_pool_max_2_s1 = 9 + } kpu_pool_type_t; + + struct kpu_batchnorm_segment + { + int32_t mul; + int32_t shift; + int32_t add; + }; + + struct kpu_activation_segment + { + int64_t start_x; + int32_t mul; + int32_t shift; + int32_t add; + }; + + using kpu_activation_table_t = std::array; + } +} +} diff --git a/lib/nncase/include/targets/neutral/neutral_ops_body.h b/lib/nncase/include/targets/neutral/neutral_ops_body.h new 
file mode 100644 index 0000000..d4ea798 --- /dev/null +++ b/lib/nncase/include/targets/neutral/neutral_ops_body.h @@ -0,0 +1,258 @@ +#pragma once +#include "../node_body.h" + +namespace nncase +{ +namespace targets +{ + namespace neutral + { + struct binary_options : public simple_node_body + { + memory_range input_a; + memory_range input_b; + memory_range output; + binary_op_t binary_op; + runtime_shape_t in_a_shape; + runtime_shape_t in_b_shape; + runtime_shape_t out_shape; + value_range fused_activation; + }; + + struct concat_options + { + memory_range output; + uint32_t inner_size; + uint32_t outer_size; + uint32_t inputs_count; + xtl::span inputs; + xtl::span dims; + + void deserialize(runtime::span_reader &reader) + { + reader.read(output); + reader.read(inner_size); + reader.read(outer_size); + reader.read(inputs_count); + reader.read_span(inputs, inputs_count); + reader.read_span(dims, inputs_count); + } + + void serialize(runtime::binary_writer &writer) const + { + writer.write(output); + writer.write(inner_size); + writer.write(outer_size); + writer.write(inputs_count); + writer.write_array(inputs); + writer.write_array(dims); + } + }; + + struct conv2d_options + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + int32_t groups; + int32_t out_channels; + padding padding_h; + padding padding_w; + int32_t filter_h; + int32_t filter_w; + int32_t stride_h; + int32_t stride_w; + int32_t dilation_h; + int32_t dilation_w; + value_range fused_activation; + xtl::span weights; + xtl::span bias; + + void deserialize(runtime::span_reader &reader) + { + reader.read(input); + reader.read(output); + reader.read(in_shape); + reader.read(groups); + reader.read(out_channels); + reader.read(padding_h); + reader.read(padding_w); + reader.read(filter_h); + reader.read(filter_w); + reader.read(stride_h); + reader.read(stride_w); + reader.read(dilation_h); + reader.read(dilation_w); + reader.read(fused_activation); + reader.read_span(weights, 
(size_t)out_channels * in_shape[1] / groups * filter_h * filter_w); + reader.read_span(bias, out_channels); + } + + void serialize(runtime::binary_writer &writer) const + { + writer.write(input); + writer.write(output); + writer.write(in_shape); + writer.write(groups); + writer.write(out_channels); + writer.write(padding_h); + writer.write(padding_w); + writer.write(filter_h); + writer.write(filter_w); + writer.write(stride_h); + writer.write(stride_w); + writer.write(dilation_h); + writer.write(dilation_w); + writer.write(fused_activation); + writer.write_array(weights); + writer.write_array(bias); + } + }; + + struct dequantize_options : public simple_node_body + { + memory_range input; + memory_range output; + quant_param_t quant_param; + }; + + struct matmul_options + { + memory_range input_a; + memory_range input_b; + memory_range output; + int32_t a_rows; + int32_t a_cols; + int32_t b_cols; + value_range fused_activation; + xtl::span bias; + + void deserialize(runtime::span_reader &reader) + { + reader.read(input_a); + reader.read(input_b); + reader.read(output); + reader.read(a_rows); + reader.read(a_cols); + reader.read(b_cols); + reader.read(fused_activation); + reader.read_span(bias, b_cols); + } + + void serialize(runtime::binary_writer &writer) const + { + writer.write(input_a); + writer.write(input_b); + writer.write(output); + writer.write(a_rows); + writer.write(a_cols); + writer.write(b_cols); + writer.write(fused_activation); + writer.write_array(bias); + } + }; + + struct memory_copy_options : public simple_node_body + { + memory_range input; + memory_range output; + }; + + struct pad_options : public simple_node_body + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + runtime_paddings_t paddings; + scalar pad_value; + }; + + struct quantize_options : public simple_node_body + { + memory_range input; + memory_range output; + quant_param_t quant_param; + }; + + struct reduce_options : public simple_node_body + { + 
memory_range input; + memory_range output; + reduce_op_t reduce_op; + runtime_shape_t in_shape; + runtime_shape_t out_shape; + float init_value; + }; + + struct reduce_window2d_options : simple_node_body + { + memory_range input; + memory_range output; + reduce_op_t reduce_op; + runtime_shape_t in_shape; + padding padding_h; + padding padding_w; + int32_t filter_h; + int32_t filter_w; + int32_t stride_h; + int32_t stride_w; + int32_t dilation_h; + int32_t dilation_w; + float init_value; + value_range fused_activation; + }; + + struct resize_bilinear_options : public simple_node_body + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + int32_t out_h; + int32_t out_w; + bool align_corners; + }; + + struct resize_nearest_neighbor_options : public simple_node_body + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + int32_t out_h; + int32_t out_w; + bool align_corners; + }; + + struct softmax_options : public simple_node_body + { + memory_range input; + memory_range output; + int32_t inner_size; + int32_t outer_size; + float beta; + }; + + struct transpose_options : public simple_node_body + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + runtime_shape_t perm; + }; + + struct strided_slice_options : public simple_node_body + { + memory_range input; + memory_range output; + runtime_shape_t in_shape; + runtime_shape_t begin; + runtime_shape_t end; + runtime_shape_t strides; + int32_t begin_mask; + int32_t end_mask; + int32_t ellipsis_mask; + int32_t new_axis_mask; + int32_t shrink_axis_mask; + }; + } +} +} diff --git a/lib/nncase/include/targets/node_body.h b/lib/nncase/include/targets/node_body.h new file mode 100644 index 0000000..7920fe5 --- /dev/null +++ b/lib/nncase/include/targets/node_body.h @@ -0,0 +1,24 @@ +#pragma once +#include "../runtime/binary_writer.h" +#include "../runtime/span_reader.h" +#include + +namespace nncase +{ +namespace targets +{ + template + struct 
simple_node_body + { + void deserialize(runtime::span_reader &reader) + { + reader.read(static_cast(*this)); + } + + void serialize(runtime::binary_writer &writer) const + { + writer.write(static_cast(*this)); + } + }; +} +} diff --git a/lib/nncase/nncase.cpp b/lib/nncase/nncase.cpp new file mode 100644 index 0000000..13115e4 --- /dev/null +++ b/lib/nncase/nncase.cpp @@ -0,0 +1,116 @@ +/* Copyright 2018 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; + +class nncase_context +{ +public: + int load_kmodel(const uint8_t *buffer) + { + return interpreter_.try_load_model(buffer) ? 
0 : -1; + } + + int get_output(uint32_t index, uint8_t **data, size_t *size) + { + if (index >= interpreter_.outputs_size()) + return -1; + + auto mem = interpreter_.memory_at(interpreter_.output_at(index)); + *data = mem.data(); + *size = mem.size(); + return 0; + } + + int run_kmodel(const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata) + { + done_callback_ = done_callback; + userdata_ = userdata; + interpreter_.dma_ch(dma_ch); + + auto input = interpreter_.input_at(0); + auto mem = interpreter_.memory_at(input); + std::copy(src, src + mem.size(), mem.begin()); + interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this); + return 0; + } + +private: + void on_done() + { + printf("Total: %fms\n", interpreter_.total_duration().count() / 1e6); + + if (done_callback_) + done_callback_(userdata_); + } + + static void done_thunk(void *userdata) + { + reinterpret_cast(userdata)->on_done(); + } + + static void on_error_thunk(const char *err, void *userdata) + { + printf("Fatal: %s\n", err); + } + + static void node_profile_thunk(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata) + { + printf("%s: %fms\n", node_opcode_names(op).data(), duration.count() / 1e6); + } + +private: + interpreter_t interpreter_; + kpu_done_callback_t done_callback_; + void *userdata_; +}; + +int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer) +{ + auto nnctx = new (std::nothrow) nncase_context(); + if (ctx) + { + ctx->is_nncase = 1; + ctx->nncase_ctx = nnctx; + return nnctx->load_kmodel(buffer); + } + else + { + return -1; + } +} + +int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size) +{ + auto nnctx = reinterpret_cast(ctx->nncase_ctx); + return nnctx->get_output(index, data, size); +} + +void nncase_model_free(kpu_model_context_t *ctx) +{ + auto nnctx = reinterpret_cast(ctx->nncase_ctx); + delete nnctx; + ctx->nncase_ctx = nullptr; +} + +int 
nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata) +{ + auto nnctx = reinterpret_cast(ctx->nncase_ctx); + return nnctx->run_kmodel(src, dma_ch, done_callback, userdata); +} diff --git a/lib/nncase/runtime/interpreter.cpp b/lib/nncase/runtime/interpreter.cpp new file mode 100644 index 0000000..a993a1d --- /dev/null +++ b/lib/nncase/runtime/interpreter.cpp @@ -0,0 +1,131 @@ +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; + +bool interpreter_base::try_load_model(const uint8_t *buffer) +{ + auto offset = buffer; + model_header_ = reinterpret_cast(buffer); + + // Validate model + if (model_header_->identifier != MODEL_IDENTIFIER || model_header_->version != MODEL_VERSION || (model_header_->target != MODEL_TARGET_CPU && model_header_->target != MODEL_TARGET_K210)) + return false; + + // Allocate buffers + main_mem_.reset(new (std::nothrow) uint8_t[model_header_->main_mem]); + if (!main_mem_) + return false; + + offset += sizeof(model_header); + inputs_ = { reinterpret_cast(offset), inputs_size() }; + offset += sizeof(memory_range) * inputs_size(); + input_shapes_ = { reinterpret_cast(offset), inputs_size() }; + offset += sizeof(runtime_shape_t) * inputs_size(); + outputs_ = { reinterpret_cast(offset), outputs_size() }; + offset += sizeof(memory_range) * outputs_size(); + constants_ = { offset, model_header_->constants }; + offset += constants_.size(); + node_headers_ = { reinterpret_cast(offset), nodes_size() }; + offset += sizeof(node_header) * nodes_size(); + node_body_start_ = offset; + + return initialize(); +} + +bool interpreter_base::initialize() +{ + return true; +} + +void interpreter_base::run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata) +{ + run_callback_ = callback; + on_error_ = on_error; + node_profile_ = node_profile; + userdata_ = userdata; + 
cnt_node_ = 0; + cnt_node_body_ = node_body_start_; + total_duration_ = {}; + last_time_.reset(); + step(); +} + +void interpreter_base::step() +{ + auto result = kcr_done; + + while (result == kcr_done) + { + if (!last_time_) + { + last_time_ = clock_t::now(); + } + else + { + auto now = clock_t::now(); + auto duration = now - *last_time_; + total_duration_ += duration; + last_time_ = now; + + if (node_profile_) + node_profile_(last_op_, duration, userdata_); + } + + if (cnt_node_ == nodes_size()) + { + run_callback_(userdata_); + break; + } + else + { + auto node_id = cnt_node_++; + auto header = node_headers_[node_id]; + xtl::span body(cnt_node_body_, header.body_size); + cnt_node_body_ += header.body_size; + last_op_ = header.opcode; + + result = call_kernel(header.opcode, body, static_cast(*this), &interpreter_base::step); + + if (result == kcr_error) + { + if (on_error_) + { + char buffer[256]; + auto name = node_opcode_names(header.opcode); + if (!name.empty()) + std::sprintf(buffer, "error occurs in running kernel: %s", name.data()); + else + std::sprintf(buffer, "Unknown opcode: (%d)", header.opcode); + on_error_(buffer, userdata_); + } + + break; + } + } + } +} + +xtl::span interpreter_base::memory_at(const memory_range &range) const noexcept +{ + uintptr_t base; + + switch (range.memory_type) + { + case mem_const: + base = (uintptr_t)constants_.data(); + break; + case mem_main: + base = (uintptr_t)main_mem_.get(); + break; + default: + base = 0; + assert(!"Invalid memory type"); + break; + } + + return { reinterpret_cast(base + range.start), range.size }; +} diff --git a/lib/nncase/runtime/kernel_registry.cpp b/lib/nncase/runtime/kernel_registry.cpp new file mode 100644 index 0000000..e7e1ffe --- /dev/null +++ b/lib/nncase/runtime/kernel_registry.cpp @@ -0,0 +1,55 @@ +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; + +namespace nncase +{ +namespace targets +{ +#define 
BEGINE_DEFINE_TARGET(target) \ + namespace target \ + { + +#define DEFINE_RUNTIME_OP(target, id, name, value) \ + kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t); + +#define END_DEFINE_TARGET() } + +#include + +#undef BEGINE_DEFINE_TARGET +#undef DEFINE_RUNTIME_OP +#undef END_DEFINE_TARGET +} +} + +kernel_call_result runtime::call_kernel(runtime_opcode opcode, xtl::span body, interpreter_t &interpreter, interpreter_step_t step) +{ + span_reader reader(body); + + switch (opcode) + { +#define BEGINE_DEFINE_TARGET(...) +#define DEFINE_RUNTIME_OP(target, id, name, value) \ + case rop_##id: \ + { \ + nncase::targets::target::id##_options options; \ + options.deserialize(reader); \ + return nncase::targets::target::id(options, interpreter, step); \ + } +#define END_DEFINE_TARGET() + +#include + +#undef BEGINE_DEFINE_TARGET +#undef DEFINE_RUNTIME_OP +#undef END_DEFINE_TARGET + default: + return kcr_error; + } +} diff --git a/lib/nncase/targets/cpu/cpu_ops.cpp b/lib/nncase/targets/cpu/cpu_ops.cpp new file mode 100644 index 0000000..d83fa84 --- /dev/null +++ b/lib/nncase/targets/cpu/cpu_ops.cpp @@ -0,0 +1,79 @@ +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; + +namespace nncase +{ +namespace targets +{ + namespace cpu + { + kernel_call_result cpu_conv2d(cpu_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + kernels::cpu::conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.out_channels, options.filter_h, + options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation); + return kcr_done; + } + + kernel_call_result cpu_depthwise_conv2d(cpu_depthwise_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t 
step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + kernels::cpu::depthwise_conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.filter_h, + options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation); + return kcr_done; + } + + runtime::kernel_call_result cpu_reduce_window2d(cpu_reduce_window2d_options &options, interpreter_t &interpreter, runtime::interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + + auto reduce = [&](auto binary_op, auto window_op) { + kernels::cpu::reduce_window2d(input.data(), output.data(), options.init_value, options.in_shape, options.filter_h, options.filter_w, options.stride_h, + options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation, binary_op, window_op); + }; + + switch (options.reduce_op) + { + case reduce_mean: + reduce([](auto a, auto b) { return a + b; }, [](auto v, auto k) { return v / k; }); + return runtime::kcr_done; + case reduce_min: + reduce([](auto a, auto b) { return std::min(a, b); }, [](auto v, auto k) { return v; }); + return runtime::kcr_done; + case reduce_max: + reduce([](auto a, auto b) { return std::max(a, b); }, [](auto v, auto k) { return v; }); + return kcr_done; + default: + return kcr_error; + } + } + + kernel_call_result cpu_quantized_conv2d(cpu_quantized_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + kernels::cpu::quantized_conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.out_channels, options.filter_h, + options.filter_w, options.stride_h, 
options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, + options.input_offset, options.filter_offset, options.output_mul, options.output_shift, options.output_offset); + return kcr_done; + } + + kernel_call_result cpu_quantized_depthwise_conv2d(cpu_quantized_depthwise_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + kernels::cpu::quantized_depthwise_conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.filter_h, + options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, + options.input_offset, options.filter_offset, options.output_mul, options.output_shift, options.output_offset); + return kcr_done; + } + } +} +} diff --git a/lib/nncase/targets/k210/interpreter.cpp b/lib/nncase/targets/k210/interpreter.cpp new file mode 100644 index 0000000..7155e14 --- /dev/null +++ b/lib/nncase/targets/k210/interpreter.cpp @@ -0,0 +1,36 @@ +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::targets::k210; + +interpreter::interpreter() +#if NNCASE_TARGET_K210_SIMULATOR + : kpu_mem_(std::make_unique(2 * 1024 * 1024)) +#endif +{ +#if !NNCASE_TARGET_K210_SIMULATOR + kpu->interrupt_clear.reg = 7; + kpu->interrupt_mask.reg = 7; + kpu->fifo_threshold.reg = 10 | (1 << 4); + kpu->eight_bit_mode.reg = 1; + + plic_set_priority(IRQN_AI_INTERRUPT, 1); +#endif +} + +xtl::span interpreter::memory_at(const memory_range &range) const noexcept +{ + if (range.memory_type == mem_k210_kpu) + { + uintptr_t base = +#if NNCASE_TARGET_K210_SIMULATOR + (uintptr_t)kpu_mem_.get(); +#else + (uintptr_t)AI_IO_BASE_ADDR; +#endif + return { reinterpret_cast(base + range.start), range.size }; + } + + return interpreter_base::memory_at(range); +} diff --git 
a/lib/nncase/targets/k210/k210_ops.cpp b/lib/nncase/targets/k210/k210_ops.cpp new file mode 100644 index 0000000..d7a092d --- /dev/null +++ b/lib/nncase/targets/k210/k210_ops.cpp @@ -0,0 +1,179 @@ +#include +#include +#include +#if !NNCASE_TARGET_K210_SIMULATOR +#include +#include +#endif + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::targets::k210; + +namespace +{ +#if !NNCASE_TARGET_K210_SIMULATOR +void kpu_send_layer(const kpu_layer_argument_t &layer) +{ + kpu->layer_argument_fifo = layer.interrupt_enabe.reg; + kpu->layer_argument_fifo = layer.image_addr.reg; + kpu->layer_argument_fifo = layer.image_channel_num.reg; + kpu->layer_argument_fifo = layer.image_size.reg; + kpu->layer_argument_fifo = layer.kernel_pool_type_cfg.reg; + kpu->layer_argument_fifo = layer.kernel_load_cfg.reg; + kpu->layer_argument_fifo = layer.kernel_offset.reg; + kpu->layer_argument_fifo = layer.kernel_calc_type_cfg.reg; + kpu->layer_argument_fifo = layer.write_back_cfg.reg; + kpu->layer_argument_fifo = layer.conv_value.reg; + kpu->layer_argument_fifo = layer.conv_value2.reg; + kpu->layer_argument_fifo = layer.dma_parameter.reg; +} + +void kpu_conv2d_normal(kpu_layer_argument_t &layer, plic_irq_callback_t callback, void *userdata) +{ + kpu->interrupt_clear.reg = 0b111; + kpu->interrupt_mask.reg = 0b110; + layer.interrupt_enabe.data.int_en = 1; + plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata); + plic_irq_enable(IRQN_AI_INTERRUPT); + kpu_send_layer(layer); +} + +void kpu_conv2d_output(kpu_layer_argument_t &layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata) +{ + kpu->interrupt_clear.reg = 0b111; + kpu->interrupt_mask.reg = 0b111; + layer.dma_parameter.data.send_data_out = 1; + sysctl_dma_select((sysctl_dma_channel_t)dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ); + dmac_set_irq(dma_ch, callback, userdata, 1); + dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, 
DMAC_ADDR_INCREMENT, + DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8); + kpu_send_layer(layer); +} + +int kpu_plic_thunk(void *userdata) +{ + kpu->interrupt_clear.reg = 0b111; + kpu->interrupt_mask.reg = 0b111; + + auto &ctx = *reinterpret_cast(userdata); + (ctx.interpreter->*ctx.step)(); + return 0; +} +#endif +} + +namespace nncase +{ +namespace targets +{ + namespace k210 + { + kernel_call_result kpu_upload(kpu_upload_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + kernels::k210::kpu_upload(input.data(), output.data(), options.in_shape); + return kcr_done; + } + + kernel_call_result kpu_conv2d(kpu_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step) + { +#if NNCASE_TARGET_K210_SIMULATOR + auto input = interpreter.memory_at({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_src_addr * 64, 1 }); + auto kpu_out = interpreter.memory_at({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_dst_addr * 64, 1 }); + + auto in_h = static_cast(options.layer.image_size.data.i_col_high + 1); + auto in_w = static_cast(options.layer.image_size.data.i_row_wid + 1); + auto in_ch = static_cast(options.layer.image_channel_num.data.i_ch_num + 1); + runtime_shape_t in_shape { options.batches, in_ch, in_h, in_w }; + auto in_fmap_size = kernels::details::compute_size(in_shape); + + auto out_h = static_cast(options.layer.image_size.data.o_col_high + 1); + auto out_w = static_cast(options.layer.image_size.data.o_row_wid + 1); + auto out_ch = static_cast(options.layer.image_channel_num.data.o_ch_num + 1); + runtime_shape_t conv_out_shape { options.batches, out_ch, in_h, in_w }; + auto conv_out_fmap_size = kernels::details::compute_size(conv_out_shape); + runtime_shape_t out_shape { options.batches, out_ch, out_h, out_w }; + auto out_fmap_size = 
kernels::details::compute_size(out_shape); + + auto input_tmp = std::make_unique(in_fmap_size); + auto workspace = std::make_unique(conv_out_fmap_size); + auto conv_output_tmp = std::make_unique(conv_out_fmap_size); + auto output_tmp = std::make_unique(out_fmap_size); + + kernels::k210::kpu_download(input.data(), input_tmp.get(), in_shape); + auto is_depthwise = options.layer.interrupt_enabe.data.depth_wise_layer != 0; + auto filter_size = get_kpu_filter_size((kpu_filter_type_t)options.layer.kernel_pool_type_cfg.data.kernel_type); + auto pad_value = (uint8_t)options.layer.kernel_pool_type_cfg.data.pad_value; + auto arg_x = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_x); + auto shift_x = (int32_t)options.layer.conv_value.data.shr_x; + auto arg_w = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_w); + auto shift_w = (int32_t)options.layer.conv_value.data.shr_w; + auto arg_add = kernels::details::to_signed<40>(options.layer.conv_value2.data.arg_add); + + auto batchnorm = std::make_unique(out_ch); + for (size_t i = 0; i < out_ch; i++) + { + auto &src = options.batch_norm[i].batchnorm.data; + auto &dest = batchnorm[i]; + dest.mul = (int32_t)kernels::details::to_signed<24>(src.norm_mul); + dest.shift = (int32_t)src.norm_shift; + dest.add = (int32_t)kernels::details::to_signed<32>(src.norm_add); + } + + kpu_activation_table_t activation; + for (size_t i = 0; i < 16; i++) + { + auto &src = options.activation->activate_para[i].data; + auto &dest = activation[i]; + dest.start_x = kernels::details::to_signed<36>(src.x_start); + dest.mul = (int32_t)kernels::details::to_signed<16>(src.y_mul); + dest.shift = (int32_t)src.shift_number; + + if (i < 16) + dest.add = options.activation->activate_para_bias0.data.result_bias[i]; + else + dest.add = options.activation->activate_para_bias1.data.result_bias[i - 16]; + } + +#define KPU_CONV2D_IMPL(is_depthwise_val, filter_size_val) \ + if (is_depthwise == is_depthwise_val && 
filter_size == filter_size_val) \ + kernels::k210::kpu_conv2d(input_tmp.get(), workspace.get(), conv_output_tmp.get(), options.weights.data(), \ + in_h, in_w, in_ch, out_ch, pad_value, arg_x, shift_x, arg_w, shift_w, arg_add, batchnorm.get(), activation) + + KPU_CONV2D_IMPL(true, 1); + else KPU_CONV2D_IMPL(true, 3); + else KPU_CONV2D_IMPL(false, 1); + else KPU_CONV2D_IMPL(false, 3); + + kernels::k210::kpu_pool2d(conv_output_tmp.get(), output_tmp.get(), in_h, in_w, out_ch, (kpu_pool_type_t)options.layer.kernel_pool_type_cfg.data.pool_type); + kernels::k210::kpu_upload(output_tmp.get(), kpu_out.data(), out_shape); + if (options.main_mem_output.size) + { + auto main_output = interpreter.memory_at(options.main_mem_output); + std::copy(output_tmp.get(), output_tmp.get() + out_fmap_size, main_output.data()); + } + + return kcr_done; +#else + auto &ctx = interpreter.context(); + ctx.interpreter = &interpreter; + ctx.step = step; + + if (options.main_mem_output.size) + { + auto main_output = interpreter.memory_at(options.main_mem_output); + kpu_conv2d_output(options.layer, interpreter.dma_ch(), main_output.data(), kpu_plic_thunk, &ctx); + } + else + { + kpu_conv2d_normal(options.layer, kpu_plic_thunk, &ctx); + } + + return kcr_async; +#endif + } + } +} +} diff --git a/lib/nncase/targets/neutral/neutral_ops.cpp b/lib/nncase/targets/neutral/neutral_ops.cpp new file mode 100644 index 0000000..f240666 --- /dev/null +++ b/lib/nncase/targets/neutral/neutral_ops.cpp @@ -0,0 +1,238 @@ +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; + +#define ELEM_SIZE_IMPL(type, KERNEL) \ + switch (runtime::get_bytes(type)) \ + { \ + case 1: \ + KERNEL(uint8_t); \ + break; \ + case 2: \ + KERNEL(uint16_t); \ + break; \ + case 4: \ + KERNEL(uint32_t); \ + break; \ + default: \ + return kcr_error; \ + } + +namespace nncase +{ +namespace targets +{ + namespace neutral + { + kernel_call_result binary(binary_options &options, interpreter_t &interpreter, 
interpreter_step_t step) + { + auto input_a = interpreter.memory_at(options.input_a); + auto input_b = interpreter.memory_at(options.input_b); + auto output = interpreter.memory_at(options.output); + + auto binary = [&](auto op) { + kernels::neutral::binary(input_a.data(), input_b.data(), output.data(), options.in_a_shape, options.in_b_shape, options.out_shape, options.fused_activation, op); + }; + + switch (options.binary_op) + { + case binary_add: + binary([](auto a, auto b) { return a + b; }); + return kcr_done; + case binary_sub: + binary([](auto a, auto b) { return a - b; }); + return kcr_done; + case binary_mul: + binary([](auto a, auto b) { return a * b; }); + return kcr_done; + case binary_div: + binary([](auto a, auto b) { return a / b; }); + return kcr_done; + default: + return kcr_error; + } + } + + kernel_call_result concat(concat_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto output = interpreter.memory_at(options.output); + kernels::neutral::concat(options.inputs, output.data(), options.dims, options.inner_size, options.outer_size, + [&](const memory_range &range) { return interpreter.memory_at(range).data(); }); + return kcr_done; + } + + kernel_call_result conv2d(conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + kernels::neutral::conv2d(input.data(), output.data(), options.weights.data(), options.bias.data(), options.in_shape, options.groups, options.out_channels, options.filter_h, + options.filter_w, options.stride_h, options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation); + return kcr_done; + } + + kernel_call_result dequantize(dequantize_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = 
interpreter.memory_at(options.output); + + kernels::neutral::dequantize(input.data(), output.data(), input.size(), options.quant_param); + return kcr_done; + } + + kernel_call_result matmul(matmul_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input_a = interpreter.memory_at(options.input_a); + auto input_b = interpreter.memory_at(options.input_b); + auto output = interpreter.memory_at(options.output); + kernels::neutral::matmul(input_a.data(), input_b.data(), output.data(), options.bias.data(), options.a_rows, options.a_cols, options.b_cols, options.fused_activation); + return kcr_done; + } + + kernel_call_result memory_copy(memory_copy_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + + std::copy(input.begin(), input.end(), output.begin()); + return kcr_done; + } + + kernel_call_result pad(pad_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + +#define PAD_KERNEL(T) \ + kernels::neutral::pad(reinterpret_cast(input.data()), reinterpret_cast(output.data()), options.in_shape, options.paddings, options.pad_value.as()); + + ELEM_SIZE_IMPL(options.input.datatype, PAD_KERNEL); + return kcr_done; +#undef PAD_KERNEL + } + + kernel_call_result quantize(quantize_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + + kernels::neutral::quantize(input.data(), output.data(), input.size(), options.quant_param); + return runtime::kcr_done; + } + + kernel_call_result reduce(reduce_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = 
interpreter.memory_at(options.output); + + auto reduce = [&](auto op) { + kernels::neutral::reduce(input.data(), output.data(), options.init_value, options.in_shape, options.out_shape, op); + }; + + switch (options.reduce_op) + { + case reduce_mean: + { + reduce([](auto a, auto b) { return a + b; }); + auto mul = (float)output.size() / input.size(); + kernels::neutral::unary(output.data(), output.data(), output.size(), [mul](auto a) { return a * mul; }); + return kcr_done; + } + case reduce_min: + reduce([](auto a, auto b) { return std::min(a, b); }); + return kcr_done; + case reduce_max: + reduce([](auto a, auto b) { return std::max(a, b); }); + return kcr_done; + default: + return kcr_error; + } + } + + kernel_call_result reduce_window2d(reduce_window2d_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + + auto reduce = [&](auto binary_op, auto window_op) { + kernels::neutral::reduce_window2d(input.data(), output.data(), options.init_value, options.in_shape, options.filter_h, options.filter_w, options.stride_h, + options.stride_w, options.dilation_h, options.dilation_w, options.padding_h, options.padding_w, options.fused_activation, binary_op, window_op); + }; + + switch (options.reduce_op) + { + case reduce_mean: + reduce([](auto a, auto b) { return a + b; }, [](auto v, auto k) { return v / k; }); + return kcr_done; + case reduce_min: + reduce([](auto a, auto b) { return std::min(a, b); }, [](auto v, auto k) { return v; }); + return kcr_done; + case reduce_max: + reduce([](auto a, auto b) { return std::max(a, b); }, [](auto v, auto k) { return v; }); + return kcr_done; + default: + return kcr_error; + } + } + + kernel_call_result resize_bilinear(resize_bilinear_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = 
interpreter.memory_at(options.output); + + kernels::neutral::resize_bilinear(input.data(), output.data(), options.in_shape, options.out_h, options.out_w, options.align_corners); + return kcr_done; + } + + kernel_call_result resize_nearest_neighbor(resize_nearest_neighbor_options &options, interpreter_t &interpreter, runtime::interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + +#define RESIZE_NN_KERNEL(T) \ + kernels::neutral::resize_nearest_neighbor(reinterpret_cast(input.data()), reinterpret_cast(output.data()), options.in_shape, options.out_h, options.out_w); + + ELEM_SIZE_IMPL(options.input.datatype, RESIZE_NN_KERNEL); + return kcr_done; +#undef RESIZE_NN_KERNEL + } + + kernel_call_result softmax(softmax_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + + kernels::neutral::softmax(input.data(), output.data(), options.beta, options.outer_size, options.inner_size); + return kcr_done; + } + + kernel_call_result transpose(transpose_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + +#define TRANSPOSE_KERNEL(T) \ + kernels::neutral::transpose(reinterpret_cast(input.data()), reinterpret_cast(output.data()), options.in_shape, options.perm); + + ELEM_SIZE_IMPL(options.input.datatype, TRANSPOSE_KERNEL); + return kcr_done; +#undef TRANSPOSE_KERNEL + } + + kernel_call_result strided_slice(strided_slice_options &options, interpreter_t &interpreter, interpreter_step_t step) + { + auto input = interpreter.memory_at(options.input); + auto output = interpreter.memory_at(options.output); + +#define STRIDED_SLICE_KERNEL(T) \ + kernels::neutral::strided_slice(reinterpret_cast(input.data()), reinterpret_cast(output.data()), 
options.in_shape, options.begin, options.end, options.strides); + + ELEM_SIZE_IMPL(options.input.datatype, STRIDED_SLICE_KERNEL); + return kcr_done; +#undef STRIDED_SLICE_KERNEL + } + } +} +} diff --git a/third_party/xtl/LICENSE b/third_party/xtl/LICENSE new file mode 100644 index 0000000..936ae84 --- /dev/null +++ b/third_party/xtl/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2017, Sylvain Corlay and Johan Mabille +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/third_party/xtl/README.md b/third_party/xtl/README.md new file mode 100644 index 0000000..003b428 --- /dev/null +++ b/third_party/xtl/README.md @@ -0,0 +1,66 @@ +# ![xtl](docs/source/xtl.svg) + +[![Travis](https://travis-ci.org/QuantStack/xtl.svg?branch=master)](https://travis-ci.org/QuantStack/xtl) +[![Appveyor](https://ci.appveyor.com/api/projects/status/g9bldap2wirlue9w?svg=true)](https://ci.appveyor.com/project/QuantStack/xtl) +[![Azure](https://dev.azure.com/johanmabille/johanmabille/_apis/build/status/QuantStack.xtl?branchName=master)](https://dev.azure.com/johanmabille/johanmabille/_build/latest?definitionId=1&branchName=master) +[![Documentation Status](http://readthedocs.org/projects/xtl/badge/?version=latest)](https://xtl.readthedocs.io/en/latest/?badge=latest) +[![Join the Gitter Chat](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/QuantStack/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) + +Basic tools (containers, algorithms) used by other quantstack packages + +## Installation + +`xtl` is a header-only library. We provide a package for the conda package manager. + +```bash +conda install -c conda-forge xtl +``` + +Or you can directly install it from the sources: + +```bash +cmake -DCMAKE_INSTALL_PREFIX=your_install_prefix +make install +``` + +## Documentation + +To get started with using `xtl`, check out the full documentation + +http://xtl.readthedocs.io/ + + +## Building the HTML documentation + +xtl's documentation is built with three tools + + - [doxygen](http://www.doxygen.org) + - [sphinx](http://www.sphinx-doc.org) + - [breathe](https://breathe.readthedocs.io) + +While doxygen must be installed separately, you can install breathe by typing + +```bash +pip install breathe +``` + +Breathe can also be installed with `conda` + +```bash +conda install -c conda-forge breathe +``` + +Finally, build the documentation with + +```bash +make html +``` + +from the `docs` subdirectory. 
+ +## License + +We use a shared copyright model that enables all contributors to maintain the +copyright on their contributions. + +This software is licensed under the BSD-3-Clause license. See the [LICENSE](LICENSE) file for details. diff --git a/third_party/xtl/include/xtl/xspan.hpp b/third_party/xtl/include/xtl/xspan.hpp new file mode 100644 index 0000000..651c336 --- /dev/null +++ b/third_party/xtl/include/xtl/xspan.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** +* Copyright (c) 2016, Sylvain Corlay and Johan Mabille * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ + +#ifndef XTL_XSPAN_HPP +#define XTL_XSPAN_HPP + +#include "xspan_impl.hpp" + +namespace xtl +{ + using tcb::span; + constexpr std::ptrdiff_t dynamic_extent = tcb::dynamic_extent; +} + +#endif diff --git a/third_party/xtl/include/xtl/xspan_impl.hpp b/third_party/xtl/include/xtl/xspan_impl.hpp new file mode 100644 index 0000000..578854e --- /dev/null +++ b/third_party/xtl/include/xtl/xspan_impl.hpp @@ -0,0 +1,778 @@ +// https://github.com/tcbrindle/span/blob/master/include/tcb/span.hpp +// TCP SPAN @commit cd0c6d0 + +/* +This is an implementation of std::span from P0122R7 +http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0122r7.pdf +*/ + +// Copyright Tristan Brindle 2018. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file ../../LICENSE_1_0.txt or copy at +// https://www.boost.org/LICENSE_1_0.txt) + +#ifndef TCB_SPAN_HPP_INCLUDED +#define TCB_SPAN_HPP_INCLUDED + +#include +#include +#include + +#ifndef TCB_SPAN_NO_EXCEPTIONS +// Attempt to discover whether we're being compiled with exception support +#if !(defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) +#define TCB_SPAN_NO_EXCEPTIONS +#endif +#endif + +#ifndef TCB_SPAN_NO_EXCEPTIONS +#include +#include +#endif + +// Various feature test macros + +#ifndef TCB_SPAN_NAMESPACE_NAME +#define TCB_SPAN_NAMESPACE_NAME tcb +#endif + +#ifdef TCB_SPAN_STD_COMPLIANT_MODE +#define TCB_SPAN_NO_DEPRECATION_WARNINGS +#endif + +#ifndef TCB_SPAN_NO_DEPRECATION_WARNINGS +#define TCB_SPAN_DEPRECATED_FOR(msg) [[deprecated(msg)]] +#else +#define TCB_SPAN_DEPRECATED_FOR(msg) +#endif + +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define TCB_SPAN_HAVE_CPP17 +#endif + +#if __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) +#define TCB_SPAN_HAVE_CPP14 +#endif + +namespace TCB_SPAN_NAMESPACE_NAME { + +// Establish default contract checking behavior +#if !defined(TCB_SPAN_THROW_ON_CONTRACT_VIOLATION) && \ + !defined(TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION) && \ + !defined(TCB_SPAN_NO_CONTRACT_CHECKING) +#if defined(NDEBUG) || !defined(TCB_SPAN_HAVE_CPP14) +#define TCB_SPAN_NO_CONTRACT_CHECKING +#else +#define TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION +#endif +#endif + +#if defined(TCB_SPAN_THROW_ON_CONTRACT_VIOLATION) +struct contract_violation_error : std::logic_error { + explicit contract_violation_error(const char* msg) : std::logic_error(msg) + {} +}; + +inline void contract_violation(const char* msg) +{ + throw contract_violation_error(msg); +} + +#elif defined(TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION) +[[noreturn]] inline void contract_violation(const char* /*unused*/) +{ + std::terminate(); +} +#endif + +#if 
!defined(TCB_SPAN_NO_CONTRACT_CHECKING) +#define TCB_SPAN_STRINGIFY(cond) #cond +#define TCB_SPAN_EXPECT(cond) \ + cond ? (void) 0 : contract_violation("Expected " TCB_SPAN_STRINGIFY(cond)) +#else +#define TCB_SPAN_EXPECT(cond) +#endif + +#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_inline_variables) +#define TCB_SPAN_INLINE_VAR inline +#else +#define TCB_SPAN_INLINE_VAR +#endif + +#if defined(TCB_SPAN_HAVE_CPP14) || \ + (defined(__cpp_constexpr) && __cpp_constexpr >= 201304) +#define TCB_SPAN_CONSTEXPR14 constexpr +#else +#define TCB_SPAN_CONSTEXPR14 +#endif + +#if defined(TCB_SPAN_NO_CONTRACT_CHECKING) +#define TCB_SPAN_CONSTEXPR11 constexpr +#else +#define TCB_SPAN_CONSTEXPR11 TCB_SPAN_CONSTEXPR14 +#endif + +#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_deduction_guides) +#define TCB_SPAN_HAVE_DEDUCTION_GUIDES +#endif + +#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_byte) +#define TCB_SPAN_HAVE_STD_BYTE +#endif + +#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_array_constexpr) +#define TCB_SPAN_HAVE_CONSTEXPR_STD_ARRAY_ETC +#endif + +#if defined(TCB_SPAN_HAVE_CONSTEXPR_STD_ARRAY_ETC) +#define TCB_SPAN_ARRAY_CONSTEXPR constexpr +#else +#define TCB_SPAN_ARRAY_CONSTEXPR +#endif + +#ifdef TCB_SPAN_HAVE_STD_BYTE +using byte = std::byte; +#else +using byte = unsigned char; +#endif + +TCB_SPAN_INLINE_VAR constexpr std::ptrdiff_t dynamic_extent = -1; + +template +class span; + +namespace detail { + +template +struct span_storage { + constexpr span_storage() noexcept = default; + + constexpr span_storage(E* ptr, std::ptrdiff_t /*unused*/) noexcept + : ptr(ptr) + {} + + E* ptr = nullptr; + static constexpr std::ptrdiff_t size = S; +}; + +template +struct span_storage { + constexpr span_storage() noexcept = default; + + constexpr span_storage(E* ptr, std::size_t size) noexcept + : ptr(ptr), size(size) + {} + + E* ptr = nullptr; + std::size_t size = 0; +}; + +// Reimplementation of C++17 std::size() and std::data() +#if defined(TCB_SPAN_HAVE_CPP17) 
|| \ + defined(__cpp_lib_nonmember_container_access) +using std::data; +using std::size; +#else +template +constexpr auto size(const C& c) -> decltype(c.size()) +{ + return c.size(); +} + +template +constexpr std::size_t size(const T (&)[N]) noexcept +{ + return N; +} + +template +constexpr auto data(C& c) -> decltype(c.data()) +{ + return c.data(); +} + +template +constexpr auto data(const C& c) -> decltype(c.data()) +{ + return c.data(); +} + +template +constexpr T* data(T (&array)[N]) noexcept +{ + return array; +} + +template +constexpr const E* data(std::initializer_list il) noexcept +{ + return il.begin(); +} +#endif // TCB_SPAN_HAVE_CPP17 + +#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_void_t) +using std::void_t; +#else +template +using void_t = void; +#endif + +template +using uncvref_t = + typename std::remove_cv::type>::type; + +template +struct is_span : std::false_type {}; + +template +struct is_span> : std::true_type {}; + +template +struct is_std_array : std::false_type {}; + +template +struct is_std_array> : std::true_type {}; + +template +struct has_size_and_data : std::false_type {}; + +template +struct has_size_and_data())), + decltype(detail::data(std::declval()))>> + : std::true_type {}; + +template > +struct is_container { + static constexpr bool value = + !is_span::value && !is_std_array::value && + !std::is_array::value && has_size_and_data::value; +}; + +template +using remove_pointer_t = typename std::remove_pointer::type; + +template +struct is_container_element_type_compatible : std::false_type {}; + +template +struct is_container_element_type_compatible< + T, E, void_t()))>> + : std::is_convertible< + remove_pointer_t()))> (*)[], + E (*)[]> {}; + +template +struct is_complete : std::false_type {}; + +template +struct is_complete : std::true_type {}; + +} // namespace detail + +template +class span { + static_assert(Extent == dynamic_extent || Extent >= 0, + "A span must have an extent greater than or equal to zero, " + "or a 
dynamic extent"); + static_assert(std::is_object::value, + "A span's ElementType must be an object type (not a " + "reference type or void)"); + static_assert(detail::is_complete::value, + "A span's ElementType must be a complete type (not a forward " + "declaration)"); + static_assert(!std::is_abstract::value, + "A span's ElementType cannot be an abstract class type"); + + using storage_type = detail::span_storage; + +public: + // constants and types + using element_type = ElementType; + using value_type = typename std::remove_cv::type; + using index_type = std::size_t; + using difference_type = std::ptrdiff_t; + using pointer = ElementType*; + using reference = ElementType&; + using iterator = pointer; + using const_iterator = const ElementType*; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + + static constexpr index_type extent = static_cast(Extent); + + // [span.cons], span constructors, copy, assignment, and destructor + template ::type = 0> + constexpr span() noexcept + {} + + TCB_SPAN_CONSTEXPR11 span(pointer ptr, index_type count) + : storage_(ptr, count) + { + TCB_SPAN_EXPECT(extent == dynamic_extent || count == extent); + } + + TCB_SPAN_CONSTEXPR11 span(pointer first_elem, pointer last_elem) + : storage_(first_elem, last_elem - first_elem) + { + TCB_SPAN_EXPECT(extent == dynamic_extent || + last_elem - first_elem == extent); + } + + template < + std::size_t N, std::ptrdiff_t E = Extent, + typename std::enable_if< + (E == dynamic_extent || static_cast(N) == E) && + detail::is_container_element_type_compatible< + element_type (&)[N], ElementType>::value, + int>::type = 0> + constexpr span(element_type (&arr)[N]) noexcept : storage_(arr, N) + {} + + template < + std::size_t N, std::ptrdiff_t E = Extent, + typename std::enable_if< + (E == dynamic_extent || static_cast(N) == E) && + detail::is_container_element_type_compatible< + std::array&, ElementType>::value, + int>::type = 0> + 
TCB_SPAN_ARRAY_CONSTEXPR span(std::array& arr) noexcept + : storage_(arr.data(), N) + {} + + template < + std::size_t N, std::ptrdiff_t E = Extent, + typename std::enable_if< + (E == dynamic_extent || static_cast(N) == E) && + detail::is_container_element_type_compatible< + const std::array&, ElementType>::value, + int>::type = 0> + TCB_SPAN_ARRAY_CONSTEXPR span(const std::array& arr) noexcept + : storage_(arr.data(), N) + {} + + template ::value && + detail::is_container_element_type_compatible< + Container&, ElementType>::value, + int>::type = 0> + TCB_SPAN_CONSTEXPR11 span(Container& cont) + : storage_(detail::data(cont), detail::size(cont)) + { + TCB_SPAN_EXPECT(extent == dynamic_extent || + static_cast(detail::size(cont)) == + extent); + } + + template ::value && + detail::is_container_element_type_compatible< + const Container&, ElementType>::value, + int>::type = 0> + TCB_SPAN_CONSTEXPR11 span(const Container& cont) + : storage_(detail::data(cont), detail::size(cont)) + { + TCB_SPAN_EXPECT(extent == dynamic_extent || + static_cast(detail::size(cont)) == + extent); + } + + constexpr span(const span& other) noexcept = default; + + template ::value, + int>::type = 0> + constexpr span(const span& other) noexcept + : storage_(other.data(), other.size()) + {} + + ~span() noexcept = default; + + span& operator=(const span& other) noexcept = default; + + // [span.sub], span subviews + template + TCB_SPAN_CONSTEXPR11 span first() const + { + TCB_SPAN_EXPECT(Count >= 0 && Count <= size()); + return {data(), Count}; + } + + template + TCB_SPAN_CONSTEXPR11 span last() const + { + TCB_SPAN_EXPECT(Count >= 0 && Count <= size()); + return {data() + (size() - Count), Count}; + } + + template + using subspan_return_t = + span; + + template + TCB_SPAN_CONSTEXPR11 subspan_return_t subspan() const + { + TCB_SPAN_EXPECT((Offset >= 0 && Offset <= size()) && + (Count == dynamic_extent || + (Count >= 0 && Offset + Count <= size()))); + return {data() + Offset, + Count != 
dynamic_extent + ? Count + : (Extent != dynamic_extent ? Extent - Offset + : size() - Offset)}; + } + + TCB_SPAN_CONSTEXPR11 span + first(index_type count) const + { + TCB_SPAN_EXPECT(count >= 0 && count <= size()); + return {data(), count}; + } + + TCB_SPAN_CONSTEXPR11 span + last(index_type count) const + { + TCB_SPAN_EXPECT(count >= 0 && count <= size()); + return {data() + (size() - count), count}; + } + + TCB_SPAN_CONSTEXPR11 span + subspan(index_type offset, index_type count = static_cast(dynamic_extent)) const + { + TCB_SPAN_EXPECT((offset >= 0 && offset <= size()) && + (count == dynamic_extent || + (count >= 0 && offset + count <= size()))); + return {data() + offset, + count == dynamic_extent ? size() - offset : count}; + } + + // [span.obs], span observers + constexpr index_type size() const noexcept { return storage_.size; } + + constexpr index_type size_bytes() const noexcept + { + return size() * sizeof(element_type); + } + + constexpr bool empty() const noexcept { return size() == 0; } + + // [span.elem], span element access + TCB_SPAN_CONSTEXPR11 reference operator[](index_type idx) const + { + TCB_SPAN_EXPECT(idx >= 0 && idx < size()); + return *(data() + idx); + } + + /* Extension: not in P0122 */ +#ifndef TCB_SPAN_STD_COMPLIANT_MODE + TCB_SPAN_CONSTEXPR14 reference at(index_type idx) const + { +#ifndef TCB_SPAN_NO_EXCEPTIONS + if (idx < 0 || idx >= size()) { + char msgbuf[64] = { + 0, + }; + std::snprintf(msgbuf, sizeof(msgbuf), + "Index %td is out of range for span of size %td", idx, + size()); + throw std::out_of_range{msgbuf}; + } +#endif // TCB_SPAN_NO_EXCEPTIONS + return this->operator[](idx); + } + + TCB_SPAN_CONSTEXPR11 reference front() const + { + TCB_SPAN_EXPECT(!empty()); + return *data(); + } + + TCB_SPAN_CONSTEXPR11 reference back() const + { + TCB_SPAN_EXPECT(!empty()); + return *(data() + (size() - 1)); + } + +#endif // TCB_SPAN_STD_COMPLIANT_MODE + +#ifndef TCB_SPAN_NO_FUNCTION_CALL_OPERATOR + TCB_SPAN_DEPRECATED_FOR("Use 
operator[] instead") + constexpr reference operator()(index_type idx) const + { + return this->operator[](idx); + } +#endif // TCB_SPAN_NO_FUNCTION_CALL_OPERATOR + + constexpr pointer data() const noexcept { return storage_.ptr; } + + // [span.iterators], span iterator support + constexpr iterator begin() const noexcept { return data(); } + + constexpr iterator end() const noexcept { return data() + size(); } + + constexpr const_iterator cbegin() const noexcept { return begin(); } + + constexpr const_iterator cend() const noexcept { return end(); } + + TCB_SPAN_ARRAY_CONSTEXPR reverse_iterator rbegin() const noexcept + { + return reverse_iterator(end()); + } + + TCB_SPAN_ARRAY_CONSTEXPR reverse_iterator rend() const noexcept + { + return reverse_iterator(begin()); + } + + TCB_SPAN_ARRAY_CONSTEXPR const_reverse_iterator crbegin() const noexcept + { + return const_reverse_iterator(cend()); + } + + TCB_SPAN_ARRAY_CONSTEXPR const_reverse_iterator crend() const noexcept + { + return const_reverse_iterator(cbegin()); + } + +private: + storage_type storage_{}; +}; + +#ifdef TCB_SPAN_HAVE_DEDUCTION_GUIDES + +/* Deduction Guides */ +template +span(T (&)[N])->span; + +template +span(std::array&)->span; + +template +span(const std::array&)->span; + +template +span(Container&)->span; + +template +span(const Container&)->span; + +#endif // TCB_HAVE_DEDUCTION_GUIDES + +template +constexpr span +make_span(span s) noexcept +{ + return s; +} + +#define AS_SIGNED(N) static_cast(N) + +template +constexpr span make_span(T (&arr)[N]) noexcept +{ + return {arr}; +} + +template +TCB_SPAN_ARRAY_CONSTEXPR span make_span(std::array& arr) noexcept +{ + return {arr}; +} + +template +TCB_SPAN_ARRAY_CONSTEXPR span +make_span(const std::array& arr) noexcept +{ + return {arr}; +} + +#undef AS_SIGNED + +template +constexpr span make_span(Container& cont) +{ + return {cont}; +} + +template +constexpr span +make_span(const Container& cont) +{ + return {cont}; +} + +/* Comparison operators */ +// 
Implementation note: the implementations of == and < are equivalent to +// 4-legged std::equal and std::lexicographical_compare respectively + +template +TCB_SPAN_CONSTEXPR14 bool operator==(span lhs, span rhs) +{ + if (lhs.size() != rhs.size()) { + return false; + } + + for (std::ptrdiff_t i = 0; i < lhs.size(); i++) { + if (lhs[i] != rhs[i]) { + return false; + } + } + + return true; +} + +template +TCB_SPAN_CONSTEXPR14 bool operator!=(span lhs, span rhs) +{ + return !(lhs == rhs); +} + +template +TCB_SPAN_CONSTEXPR14 bool operator<(span lhs, span rhs) +{ + // No std::min to avoid dragging in + const std::ptrdiff_t size = + lhs.size() < rhs.size() ? lhs.size() : rhs.size(); + + for (std::ptrdiff_t i = 0; i < size; i++) { + if (lhs[i] < rhs[i]) { + return true; + } + if (lhs[i] > rhs[i]) { + return false; + } + } + return lhs.size() < rhs.size(); +} + +template +TCB_SPAN_CONSTEXPR14 bool operator<=(span lhs, span rhs) +{ + return !(rhs < lhs); +} + +template +TCB_SPAN_CONSTEXPR14 bool operator>(span lhs, span rhs) +{ + return rhs < lhs; +} + +template +TCB_SPAN_CONSTEXPR14 bool operator>=(span lhs, span rhs) +{ + return !(lhs < rhs); +} + +template +span(sizeof(ElementType)) * Extent))> +as_bytes(span s) noexcept +{ + return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +template < + class ElementType, ptrdiff_t Extent, + typename std::enable_if::value, int>::type = 0> +span(sizeof(ElementType)) * Extent))> +as_writable_bytes(span s) noexcept +{ + return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +/* Extension: nonmember subview operations */ + +#ifndef TCB_SPAN_STD_COMPLIANT_MODE + +template +TCB_SPAN_CONSTEXPR11 auto first(T& t) + -> decltype(make_span(t).template first()) +{ + return make_span(t).template first(); +} + +template +TCB_SPAN_CONSTEXPR11 auto last(T& t) + -> decltype(make_span(t).template last()) +{ + return make_span(t).template last(); +} + +template +TCB_SPAN_CONSTEXPR11 auto subspan(T& t) + -> decltype(make_span(t).template 
subspan()) +{ + return make_span(t).template subspan(); +} + +template +TCB_SPAN_CONSTEXPR11 auto first(T& t, std::ptrdiff_t count) + -> decltype(make_span(t).first(count)) +{ + return make_span(t).first(count); +} + +template +TCB_SPAN_CONSTEXPR11 auto last(T& t, std::ptrdiff_t count) + -> decltype(make_span(t).last(count)) +{ + return make_span(t).last(count); +} + +template +TCB_SPAN_CONSTEXPR11 auto subspan(T& t, std::ptrdiff_t offset, + std::ptrdiff_t count = dynamic_extent) + -> decltype(make_span(t).subspan(offset, count)) +{ + return make_span(t).subspan(offset, count); +} + +#endif // TCB_SPAN_STD_COMPLIANT_MODE + +} // namespace TCB_SPAN_NAMESPACE_NAME + +/* Extension: support for C++17 structured bindings */ + +#ifndef TCB_SPAN_STD_COMPLIANT_MODE + +namespace TCB_SPAN_NAMESPACE_NAME { + +template +constexpr auto get(span s) -> decltype(s[N]) +{ + return s[N]; +} + +} // namespace TCB_SPAN_NAMESPACE_NAME + +namespace std { + +template +class tuple_size> : public integral_constant(S)> {}; + +template +class tuple_size>; // not defined + +template +class tuple_element> { +public: + using type = E; +}; + +} // end namespace std + +#endif // TCB_SPAN_STD_COMPLIANT_MODE + +#endif // TCB_SPAN_HPP_INCLUDED