commit 6c1d57f9cd8a1cf4c1d99ee25f7b4d7a97f50c6a Author: Dimitri Lozeve Date: Fri Nov 15 20:58:49 2024 +0100 Initial commit diff --git a/json.bqn b/json.bqn new file mode 100644 index 0000000..74ef0f3 --- /dev/null +++ b/json.bqn @@ -0,0 +1,227 @@ +# Part of bqn-libs: https://github.com/mlochbaum/bqn-libs +# 0-BSD License + +# JSON: JavaScript Object Notation +⟨ + Parse # JSON string to BQN + Export # BQN value to JSON (also Parse⁼) +⟩⇐ + +# JSON numbers, strings, and lists correspond directly to BQN +# Objects are represented as keys≍values +# true, false, null are represented as <"true", <"false", <"null" + +# An empty list exports as "" if its fill is a space and [] otherwise + +⟨Consts, ExportConst⟩ ← { + val ← <¨ name ← "true"‿"false"‿"null" + Consts ⇐ { + i ← name ⊐ 𝕩 + "Unknown constant" 𝕎 i = ≠name # 𝕎 formats errors + i ⊏ val + } + cm ← "Enclosed value must be JSON constant"∾∾' '⊸∾¨"or "⊸∾⌾(¯1⊸⊑)name + ExportConst ⇐ { + i ← val⊸⊐⌾< 𝕩 + cm ! i<≠val + i ⊑ name + } +} +⟨UnEscape, Escape⟩ ← { + in ← """\/bfrnt" + out ← (3↑in)∾@+8‿12‿13‿10‿9 + diff ← (out-in) ∾ 0 + Basic ← { + i ← in ⊐ 𝕩 + "Unknown escape" ! ∧´𝕨≤i<≠in + 𝕩 + 𝕨 × i ⊏ diff + } + + hc ← "0Aa" + hb ← ⥊hc+0≍˘10‿6‿6 # Hex boundaries, start and after-end + ho ← 2/hc-0‿10‿10 # Corresponding offsets + Hex ← { u 𝕊 𝕩: + d ← 𝕩 /˜ m ← ≠` (4⥊0)⊸»⊸≠ »u + t ← hb ⍋ d + "String \u must be followed by 4 hex characters" ! ∧´1=2|t + # Now m can't run past the end or self-intersect, + # or it would have hit a closing quote or backslash + v ← 16⊸×⊸+˜˝⌽ ⍉∘‿4⥊ d-t⊏ho + w‿e ← Surrogate v + ⟨(w+@-'u')⊸+⌾(u⊸/)𝕩, e⌾(u⊸/)m⟩ + } + sr ← 2⋆10 # Surrogate base/radix + sb ← sr×52+2+↕3 # Surrogate character boundaries + Surrogate ← { + c ← (≠sb)|sb⍋𝕩 # 0 for non-surrogate, 1 then 2 for surrogate + h ← 1=c # First half + "Unmatched surrogate pair" ! 
(0∾h) ≡ (2=c)∾0 + r ← 𝕩 - c⊏0∾sb # Numeric value of surrogates + v ← r + »h×sr×(2⋆6)+r + ⟨v, h⟩ + } + + UnEscape ⇐ { e 𝕊 𝕩: + u ← e ∧ 𝕩='u' + (u𝕩≥@+32 + xe‿dr ← e UnEscape 𝕩 # Escaped 𝕩; characters to drop + sg ← ((1-˜(s>q∨eo∨dr)×+`)∾+´)s∧q # Start at s∧q; exclude q, eo, dr + str ← sg ⊔ xe # Strings + + # Numbers and constants + b ← s ∨ 𝕩∊@+9‿10‿13‿32 # Whitespace (blank) + l ← ¬ b ∨ 𝕩∊"""{}[],:" # Word characters + w ← »⊸< l # Word starts + neg ← '-' = 𝕩 + dig ← ('0'≤𝕩) ∧ 𝕩≤'9' + m ← l ∧ (+`w)⊏0∾w/dig∨neg # Numbers + n ← m∧w ⋄ k ← m(»∧«)dig + CE ← {(! 𝕨 FE /⟜(/k))⍟(∨´) 𝕩} + cns ← ce Consts (1-˜(m »⊸=v∊"{:" + + # Keys + l ← (⍋g) ⊏ r # Container index + j ← +`⊸× o # Object index (start at 1; 0 if list) + keys ← ((q/(«c)×l⊏0∾j)∾1+´o) ⊔ strk + str ← ⊑keys + + # Purely numeric lists + l ⊏↩ ⍋⍋nm←0⌾⊑1(∾/∾˜)s(∨/⊣)u∊"]}""a" + nn ← +´¬nm + nl ← num⊔˜(1+´nm)∾˜0⊸<⊸×(1-nn)+(t='0')/l + jj‿jn ← 2↑(1↓nm)⊔j + Ob ← ⊑⟜keys⊸≍⍟(0<⊣) + n ← jn Ob¨ 1↓nl + + # Build collections + nv← n -˜○≠ vals←∾⟨cns,⊑nl,str,⌽n⟩ # Initial set of values + f ← (l2 cannot be exported to JSON" + ⟩ + ExportNumber # Type 1: number + !∘"Only numbers and arrays can be exported to JSON" +⟩ diff --git a/safetensors.bqn b/safetensors.bqn new file mode 100644 index 0000000..e453789 --- /dev/null +++ b/safetensors.bqn @@ -0,0 +1,103 @@ +⟨ExtractMetadata,GetArrayNames,GetArray,SerializeArrays⟩⇐ + +⟨Parse,Export⟩←•Import"json.bqn" + +ExtractHeader←{𝕊bytes: + n←2⊸×⊸+˜´⟨8‿'c',1‿'u'⟩•bit._cast 8↑bytes + ⟨Parse n↑8↓bytes,n+8⟩ +} + +JsonGet←{(⊑(⊏𝕨)⊐<𝕩)⊑1⊏𝕨} + +ExtractMetadata←{𝕊bytes: + header‿·←ExtractHeader bytes + header JsonGet⎊⟨⟩ "__metadata__" +} + +GetArrayNames←{𝕊bytes: + header‿·←ExtractHeader bytes + "__metadata__"⊸≢¨⊸/⊏header +} + +# Valid for sizes 8, 16, and 32 (passed as 𝕨) +ParseUint←{2⊸×⊸+˜´˘⟨8‿'c',1‿'u'⟩•bit._cast˘ ∘‿(𝕨÷8)⥊𝕩} +ParseInt←{(-2⋆𝕨-1)+(2⋆𝕨)|(2⋆𝕨-1)+𝕨ParseUint𝕩} + +ParseUint64←{(2⋆32)⊸×⊸+˜´˘ ∘‿2⥊32 ParseUint 𝕩} +ParseInt64←{lo‿hi←<˘⍉∘‿2⥊32 ParseUint 𝕩 ⋄ lo+(2⋆32)×hi-(2⋆32)×hi≥2⋆31} # Two's complement: low word unsigned, only the high word carries the sign + +# Parse a floating point number +# e 
is the size of the exponent part in bits; an all-zero exponent field is decoded as zero/subnormal, Inf/NaN encodings are not special-cased +ParseFloat←{e𝕊bytes: + n←⌽⟨8‿'c',1‿'u'⟩•bit._cast bytes # Reversed so that the sign bit comes first + s←(≠n)-e+1 # Number of stored significand bits + sign←1+2×-⊑n + exponent←2⊸×⊸+˜´⌽e↑1↓n + significand←2⊸×⊸+˜´⌽(0<exponent)∾e↓1↓n # Implicit leading 1 only when the exponent field is nonzero + sign×(2⋆(1⌈exponent)-((2⋆e-1)-1))×significand÷2⋆s # Exponent field 0 uses the same scale as field 1 +} + +dtypes←⟨ + "BOOL", # Boolean type + "U8", # Unsigned byte + "I8", # Signed byte + "F8_E5M2", # FP8 + "F8_E4M3", # FP8 + "I16", # Signed integer (16-bit) + "U16", # Unsigned integer (16-bit) + "F16", # Half-precision floating point + "BF16", # Brain floating point + "I32", # Signed integer (32-bit) + "U32", # Unsigned integer (32-bit) + "F32", # Floating point (32-bit) + "F64", # Floating point (64-bit) + "I64", # Signed integer (64-bit) + "U64", # Unsigned integer (64-bit) +⟩ +typeConversions←⟨ + ⟨8‿'c', 1‿'u'⟩•bit._cast, # BOOL + 8⊸ParseUint, # U8 + ⟨8‿'c', 8‿'i'⟩•bit._cast, # I8 + 5⊸ParseFloat˘∘‿1⊸⥊, # F8_E5M2 + 4⊸ParseFloat˘∘‿1⊸⥊, # F8_E4M3 + ⟨8‿'c',16‿'i'⟩•bit._cast, # I16 + 16⊸ParseUint, # U16 + 5⊸ParseFloat˘∘‿2⊸⥊, # F16 + 8⊸ParseFloat˘∘‿2⊸⥊, # BF16 + ⟨8‿'c',32‿'i'⟩•bit._cast, # I32 + 32⊸ParseUint, # U32 + 8⊸ParseFloat˘∘‿4⊸⥊, # F32 + ⟨8‿'c',64‿'f'⟩•bit._cast, # F64 + ParseInt64, # I64 + ParseUint64, # U64 +⟩ + +GetArray←{bytes𝕊name: + header‿n←ExtractHeader bytes + byteBuf←n↓bytes + info←header JsonGet name + s‿e←info JsonGet "data_offsets" + shape←info JsonGet "shape" + dtypeIdx←⊑dtypes⊐SerializeArray¨arrs + dataOffsets←<˘2↕0∾+`≠¨datas + blocks←{𝕊name‿dtype‿shape‿dataOffset: + ["dtype"‿"shape"‿"data_offsets",dtype‿shape‿dataOffset] + }¨<˘⍉>names‿dtypes‿shapes‿dataOffsets + header←[names,blocks] + n←≠headerJson←Export header + nEncoded←⟨32‿'i',8‿'c'⟩•bit._cast⟨n,0⟩ + nEncoded∾headerJson∾∾datas +}