Tag Archives: data-format

@tuple – experimenting with lisp-like syntax as XML replacement

Motivation

The following post describes an experimental format which has a lisp-like syntax, with hints of JSON, for storing tree-structured data as text. The main motivation was a need to have a less verbose alternative to XML, but still be readable and editable, support pattern matching and transformation, include common binary datatypes, and enable structural type checking.

The current specification might contain errors and is not yet complete. Rule-based transformation through pattern matching is still at an early stage.

Current specification

(@text
  (#
 
  Quick overview
    @tuple is an experimental typed data format using s-expression-like syntax
    for storing tree-structured data. In addition to handling text there is also
    support for common binary datatypes. The data format also allows transformations
    with rules that pattern match on the tree structure.
 
    A tuple is given by (<type> <exp1> <exp2> ... <expn> <type>) or its shorthand
    form (<type> <exp1> <exp2> ... <expn>). The sequence of expressions allowed
    is dependent on the type.
 
    A typed tuple may be a @int, @float, @bool, @bytes, @rule, @text, @note or
    @tuple. A named or untyped tuple is implicitly typed as @tuple. A @note
    is used for documentation or ignoring embedded tuples, and may be
    discarded during parsing.
 
    Binary operators are defined for
      :  key-value pair in @tuple or @note
      == require rule in @rule
      -> replace rule in @rule
      => merge rule in @rule
 
    A @tuple may contain key-value pairs, @note, @rule, untyped and typed @text, and
    untyped and typed @tuple.
 
    The value in a key-value pair may contain untyped or typed @text, @int, @float,
    @bool, @bytes, @tuple. It may also contain a pattern matching variable.
 
    A document must start with a named @tuple or typed tuple, but cannot start with
    a @note.
 
  Syntax
    ()    encloses a named, unnamed or typed tuple
 
    (#    begins a text block (nested text blocks are allowed but
          unbalanced text blocks must use double-quotation instead)
    #)    ends a text block
    ""    encloses a text-string of escaped characters
    @text gives one or more text-blocks and text-strings
 
    :  a pair where left side is key and right side the value
    => rule that matches the left side and then merges it with the right side
    -> rule that matches the left side and then replace it with the right side
    == rule that matches left side and then require that the right side is present
    '' encloses a pattern matching variable that specifies a type or a named reference
 
    @tuple  gives a sequence of typed or untyped @tuple, @text, @rule, or @note
    ()      gives a sequence of typed or untyped @tuple, @text, @rule, or @note
    (Name)  gives a sequence of typed or untyped @tuple, @text, @rule, or @note
    @note   gives a sequence of typed or untyped @tuple, @text, @rule, or @note
    @rule   gives a sequence of rules
    @bytes  gives a sequence of base64-encoded text containing unsigned bytes
    @int    gives a sequence of signed integers in :32 bit or :64 bit
    @float  gives a sequence of floating point numbers in :32 bit or :64 bit
    @value  gives a sequence of unbounded numeric values
    @bool   gives a sequence of true and false values
 
  Pattern matching
    * Application of pattern matching is handled in application code
    * Distinction between shallow and deep pattern matching
      (@note "a @rule:shallow pattern matches on child nodes of @tuple"
        (@rule:shallow (value: '@int') => (Data))
      )
      (@note "a @rule:deep pattern matches recursively on child nodes of @tuple and pairs"
        (@rule:deep (value: '@int') => (Data))
      )
      (@note "a @rule is implicitly typed as @rule:shallow"
        (@rule (value: '@int') => (Data))
      )
    * Pattern matching variables can be used to
      (@note "assign variable to the value of a pair"
        (Data value: 'pi')
      )
      (@note "replace a variable with an expression"
        (@rule:deep 'pi' -> (@float 3.14))
      )
      (@note "require the structure of a tuple"
        (@rule (Data) == (size: '@int'))
      )
 
  Examples
    (@note "named tuples"
       (Document title: "just another format" Document)
       (Dir name: "root" files: (
           (File name: "readme" data: (@bytes (#MTIz#)))
         )
       )
    )
    (@note "unnamed tuples"
       (title: "just another format")
       (name: "root" files: (
           (name: "readme" data: (@bytes (#MTIz#)))
         )
       )
    )
    (@note "typed tuples in key-value pairs"
       (Cell name: "a" value: (@int:32 42))
       (Screen fullscreen: (@bool false))
    )
    (@note "untyped tuples in key-value pairs"
       (Cell name: "b" value: 42)
       (Screen fullscreen: false)
    )
 
  Parsing syntax
    start            ::= tuple-named | tuple-typed | rule-typed | primitives-typed
 
    tuple            ::= tuple-named | tuple-untyped | tuple-typed
    tuple-typed      ::= typed<tuple-type,tuple-seq>
    tuple-untyped    ::= unnamed<tuple-seq>
    tuple-type       ::= '@tuple'
    tuple-named      ::= named<name,tuple-seq>
    tuple-seq        ::= {rule | note | text | pair | tuple, pad}
 
    unnamed<body>    ::= '(' body ')'
    named<tag,body>  ::= '(' tag (pad body)? (pad tag)? ')'
    typed<tag,body>  ::= '(' tag (pad body)? (pad tag)? ')'
    name             ::= [a-zA-Z][-A-Za-z0-9]*
    pair             ::= name ':' pad pair-value
    pair-value       ::= values | variable | tuple
    values           ::= values-untyped | values-typed
    values-untyped   ::= value-untyped | bool-untyped | text-untyped
    values-typed     ::= bytes-typed | float-typed | int-typed
                       | bool-typed | text-typed | tuple
 
    rule             ::= rule-typed
    rule-typed       ::= typed<rule-type,rule-seq>
    rule-type        ::= '@rule' (':shallow' | ':deep')?
    rule-seq         ::= {rule-merge | rule-replace | rule-require | note, pad}
    rule-merge       ::= tuple pad '=>' pad tuple
    rule-replace     ::= (tuple pad '->' pad tuple)
                       | variable pad '->' pad (tuple | primitives)
    rule-require     ::= tuple pad '==' pad tuple
 
    variable         ::= ''' variable-type '''
    variable-type    ::= name | int-type | float-type | bool-type | bytes-type 
                       | text-type | tuple-type | note-type
 
    note             ::= note-typed
    note-typed       ::= typed<note-type,tuple-seq>
    note-type        ::= '@note'
 
    bytes            ::= bytes-typed
    bytes-typed      ::= typed<bytes-type,bytes-untyped>
    bytes-untyped    ::= '(#' `all validated base64 text with whitespace trimmed` '#)'
    bytes-type       ::= '@bytes'
 
    value-untyped    ::= int-untyped | float-untyped
 
    int              ::= int-untyped | int-typed
    int-typed        ::= typed<int-type,int-seq>
    int-untyped      ::= `characters giving an integer of any size`
    int-type         ::= '@int' (':32' | ':64')?
    int-seq          ::= {int-untyped, pad}
 
    float            ::= float-untyped | float-typed
    float-typed      ::= typed<float-type,float-seq>
    float-untyped    ::= `characters giving a floating point of any size`
    float-type       ::= '@float' (':32' | ':64')?
    float-seq        ::= {float-untyped, pad}
 
    bool             ::= bool-untyped | bool-typed
    bool-typed       ::= typed<bool-type,bool-seq>
    bool-untyped     ::= 'true' | 'false'
    bool-type        ::= '@bool'
    bool-seq         ::= {bool-untyped, pad}
 
    text             ::= text-untyped | text-typed
    text-typed       ::= typed<text-type,text-seq>
    text-untyped     ::= text-string | text-block
    text-type        ::= '@text'
    text-seq         ::= {text-string | text-block, pad}
    text-string      ::= '"' `all validated text until unescaped double-quote` '"'
    text-block       ::= '(#' `all validated text until text-block is balanced` '#)'
 
    pad              ::= [\s]+
 
  How to handle text
    * Text-strings are escaped by replacing '\' with '\\' then '"' with '\"', and
      unescaped by replacing '\"' with '"' then '\\' with '\'.
    * Text-blocks are not escaped, but are verified to be balanced. If not, they
      become text-strings. A text-block is balanced if every embedded '(#' is matched
      by a corresponding '#)'. A text-block cannot end with the '(' character.
 
  Examples of invalid expressions
    (@note "duplicate keys"
      (Data name: "A" name: "B")
    )
    (@note "named tuple not enclosed with parenthesis"
      Data
    )
    (@note "type not enclosed with parenthesis"
      @tuple
    )
    (@note "variable does not contain a valid type"
      value: '@byte'
    )
 
  Examples of valid expressions
    (@note "untyped @tuple"
      (label: "Kyrre")
    )
    (@note "untyped @text"
      "Some characters!"
    )
    (@note "boolean value is implicit typed as @bool"
      value: true
    )
    (@note "numeric value implicit typed as @value"
      width: 1024
    )
    (@note "integer value implicit typed as @int:64"
      value: (@int 4)
    )
    (@note "float value implicit typed as @float:64"
      value: (@float 3.14)
    )
  #)
@text)

An example

Data with redundant information stored in rules:

(@tuple
 
  (@note "Rules for naming all unnamed tuples and discarding notes and rules")
  (@rule:deep
    (name: '@text' type: '@text') => (Node)
    (from: '@tuple' to: '@tuple') => (Link)
    (inputs: (
        (@rule (name: '@text' type: '@text') => (Socket))
      )
    ) => ()
    (outputs: (
        (@rule (name: '@text' type: '@text') => (Socket))
      )
    ) => ()
    '@note' -> ()
    '@rule' -> ()
  )
 
  (@note "Rules for replacing variables with data")
  (@rule:deep
    'binary-data' -> (@bytes (#QUI9PQ==#))
    'instructions' -> "some data in text format"
  )
 
  (@note "The data to be transformed")
  (name: "Group1" type: "Group"
    nodes: (
      (name: "Source" type: "Value" value: 'binary-data'
        inputs: ((name: "In" type: "bytedata"))
        outputs: ((name: "Out" type: "bytedata"))
      )
      (name: "Transform" type: "Process" data: 'instructions'
        inputs: ((name: "In" type: "bytedata"))
        outputs: ((name: "Out" type: "bytedata"))
      )
      (name: "Target" type: "Value" value: ""
        inputs: ((name: "In" type: "bytedata"))
        outputs: ((name: "Out" type: "bytedata"))
      )
    )
    links: (
      (from: (node: "Source" socket: "Out") to: (node: "Transform" socket: "In"))
      (from: (node: "Transform" socket: "Out") to: (node: "Target" socket: "In"))
    )
  )
 
@tuple)

The resulting data after the rules of the tuple have been applied to itself:

(@tuple
  (Node name: "Group1" type: "Group"
    nodes: (
      (Node name: "Source" type: "Value" value: (@bytes (#QUI9PQ==#))
        inputs: ((Socket name: "In" type: "bytedata"))
        outputs: ((Socket name: "Out" type: "bytedata"))
      )
      (Node name: "Transform" type: "Process" data: "some data in text format"
        inputs: ((Socket name: "In" type: "bytedata"))
        outputs: ((Socket name: "Out" type: "bytedata"))
      )
      (Node name: "Target" type: "Value" value: "output"
        inputs: ((Socket name: "In" type: "bytedata"))
        outputs: ((Socket name: "Out" type: "bytedata"))
      )
    )
 
    links: (
      (Link from: (node: "Source" socket: "Out")
        to: (node: "Transform" socket: "In")
      )
      (Link from: (node: "Transform" socket: "Out")
        to: (node: "Target" socket: "In")
      )
    )
  )
@tuple)

Implementation

A basic implementation in Scala is available at bitbucket.org/trondolsen/tuples, along with API documentation.

Ideas for extensions

  • Chaining together separate files
  • Nesting separate files with pattern matching variables – like referencing large bytedata and instantiating tuples
  • Use separate files for validation, update and typing of data
  • Pattern matching bits, will maybe complicate the format too much

Simple serialization with XML strings in Javascript

Introduction

Included in this post are some quick and simple functions for doing XML serialization in JavaScript through strings. The reserved XML characters < > & ' " are escaped in attribute values and text, while element and attribute names containing them are rejected; if any invalid input is found, an exception is thrown.

List of XML utility functions:

  • toXmlHeader() – Creates the XML header.
  • toXmlElem(name: string, attributes: object) – Creates and closes an XML element.
  • toXmlElemOpen(name: string, attributes: object) – Creates and opens an XML element.
  • toXmlElemClose(name: string) – Closes an XML element.
  • toXmlText(text: string) – Creates XML text.

A serialization example

// Serialization example: every helper returns a string fragment, so the
// document is assembled by listing the fragments in order and joining them.
var xml = [
  toXmlHeader(),
  toXmlElemOpen("Library"), // No attributes given.

  // A parent element with one self-closing child.
  toXmlElemOpen("Authors", {country: "Norway"}),
  toXmlElem("Author", {name:"Petter Dass"}),
  toXmlElemClose("Authors"),

  // A parent element mixing self-closing children and a child with text.
  toXmlElemOpen("Books"), // No attributes given.
  toXmlElem("Book", {title:"book1", author:"author1"}),
  toXmlElemOpen("Book", {title:"book2", author:"author1"}),
  toXmlText("Ach, So?"),
  toXmlElemClose("Book"),
  toXmlElem("Book", {title:"book3", author:"author2"}),
  toXmlElem("Book"), // No attributes given.
  toXmlElemClose("Books"),

  toXmlElemClose("Library")
].join("");

alert(xml); // the complete xml is now contained inside the string.
<?xml version="1.0" encoding="UTF-8"?>
<Library>
<Authors country="Norway">
<Author name="Petter Dass"/></Authors>
<Books>
<Book title="book1" author="author1"/>
<Book title="book2" author="author1">Ach, So?</Book>
<Book title="book3" author="author2"/>
<Book/></Books></Library>

Sourcecode for the XML utility functions

/**
 * Validates a string for use as an XML element or attribute name.
 *
 * The value is never escaped; it is rejected outright when writing it as a
 * name would produce malformed markup.
 *
 * @param {string} str - Candidate element or attribute name.
 * @returns {string} The same value, unchanged, when it is acceptable.
 * @throws {string} "invalid string given" when the value is undefined, null,
 *   has no match method, is empty, or contains whitespace or one of the
 *   reserved characters < > & ' ".
 */
function toXmlValid(str) {
  if (str === undefined || str === null || str.match === undefined) {
    throw("invalid string given");
  }

  // An empty name would serialize as "<>", and a name containing whitespace
  // would be read back as a shorter name followed by a broken attribute.
  if (str.length === 0 || str.match(/[<>&'"\s]/) !== null) {
    throw("invalid string given");
  }

  return str;
}
 
/**
 * Replaces the five reserved XML characters & " < > ' in a string with their
 * entity references.
 *
 * @param {string} str - Text to escape.
 * @returns {string} The text with every reserved character replaced.
 * @throws {string} "invalid string given" when the value is undefined, null,
 *   or has no replace method.
 */
function escapeXmlText(str) {
  var usable = str !== undefined && str !== null && str.replace !== undefined;
  if (!usable) {
    throw("invalid string given");
  }

  // Applied strictly in this order: "&" must go first, otherwise the
  // ampersands introduced by the later entities would be escaped again.
  var substitutions = [
    [/&/g, "&amp;"],
    [/"/g, "&quot;"],
    [/</g, "&lt;"],
    [/>/g, "&gt;"],
    [/'/g, "&apos;"]
  ];

  var escaped = str;
  for (var i = 0; i < substitutions.length; i += 1) {
    escaped = escaped.replace(substitutions[i][0], substitutions[i][1]);
  }

  return escaped;
}
 
/**
 * Replaces the entity references &lt; &gt; &apos; &quot; &amp; in a string
 * with the characters they stand for.
 *
 * @param {string} str - Text to unescape.
 * @returns {string} The text with the five entity references expanded.
 * @throws {string} "invalid string given" when the value is undefined, null,
 *   or has no replace method.
 */
function unescapeXmlText(str) {
  var usable = str !== undefined && str !== null && str.replace !== undefined;
  if (!usable) {
    throw("invalid string given");
  }

  // Applied strictly in this order: "&amp;" must go last, otherwise an
  // escaped ampersand followed by "lt;" would wrongly expand to "<".
  var substitutions = [
    [/&lt;/g, "<"],
    [/&gt;/g, ">"],
    [/&apos;/g, "'"],
    [/&quot;/g, "\""],
    [/&amp;/g, "&"]
  ];

  var unescaped = str;
  for (var i = 0; i < substitutions.length; i += 1) {
    unescaped = unescaped.replace(substitutions[i][0], substitutions[i][1]);
  }

  return unescaped;
}
 
/**
 * Creates the XML declaration that opens a document.
 *
 * @returns {string} The declaration for XML version 1.0 with UTF-8 encoding,
 *   without a trailing line break.
 */
function toXmlHeader() {
  var declaration = '<?xml version="1.0" encoding="UTF-8"?>';
  return declaration;
}
 
/**
 * Creates a single XML attribute, preceded by one space.
 *
 * The name is passed through toXmlValid and the value through escapeXmlText,
 * so whatever those two throw propagates to the caller.
 *
 * @param {string} name - Attribute name.
 * @param {string} value - Attribute value.
 * @returns {string} The fragment ` name="value"`.
 */
function toXmlAttr(name, value) {
  // The name is checked before the value is escaped.
  var checkedName = toXmlValid(name);
  var escapedValue = escapeXmlText(value);

  return " " + checkedName + "=\"" + escapedValue + "\"";
}
 
/**
 * Creates a self-closing XML element, preceded by a line break.
 *
 * The tag name is passed through toXmlValid and every own enumerable
 * property of `attributes` is written with toXmlAttr, so whatever those two
 * throw propagates to the caller.
 *
 * @param {string} tagName - Element name.
 * @param {Object} [attributes] - Attribute names mapped to their values.
 *   May be omitted; inherited properties are skipped.
 * @returns {string} The fragment `\n<tagName attr="value" .../>`.
 */
function toXmlElem(tagName, attributes) {
  var str = "\n<" + toXmlValid(tagName);

  if (attributes !== undefined) {
    for (var key in attributes) {
      // Borrow hasOwnProperty from Object.prototype so that objects created
      // without a prototype, or carrying an attribute that is itself named
      // "hasOwnProperty", do not break the ownership check.
      if (Object.prototype.hasOwnProperty.call(attributes, key)) {
        str += toXmlAttr(key, attributes[key]);
      }
    }
  }

  return str + "/>";
}
 
/**
 * Creates the opening tag of an XML element, preceded by a line break.
 *
 * The tag name is passed through toXmlValid and every own enumerable
 * property of `attributes` is written with toXmlAttr, so whatever those two
 * throw propagates to the caller. The element is left open for the caller to
 * add content and a closing tag.
 *
 * @param {string} tagName - Element name.
 * @param {Object} [attributes] - Attribute names mapped to their values.
 *   May be omitted; inherited properties are skipped.
 * @returns {string} The fragment `\n<tagName attr="value" ...>`.
 */
function toXmlElemOpen(tagName, attributes) {
  var str = "\n<" + toXmlValid(tagName);

  if (attributes !== undefined) {
    for (var key in attributes) {
      // Borrow hasOwnProperty from Object.prototype so that objects created
      // without a prototype, or carrying an attribute that is itself named
      // "hasOwnProperty", do not break the ownership check.
      if (Object.prototype.hasOwnProperty.call(attributes, key)) {
        str += toXmlAttr(key, attributes[key]);
      }
    }
  }

  return str + ">";
}
 
/**
 * Creates the closing tag of an XML element.
 *
 * The tag name is passed through toXmlValid, so whatever it throws
 * propagates to the caller. No line break is added before the tag.
 *
 * @param {string} tagName - Element name.
 * @returns {string} The fragment `</tagName>`.
 */
function toXmlElemClose(tagName) {
  var checkedName = toXmlValid(tagName);

  return "</" + checkedName + ">";
}
 
/**
 * Creates XML character data from a string.
 *
 * Delegates entirely to escapeXmlText, so whatever it throws propagates to
 * the caller.
 *
 * @param {string} text - Text to place inside an element.
 * @returns {string} Whatever escapeXmlText returns for the text.
 */
function toXmlText(text) {
  var escapedText = escapeXmlText(text);

  return escapedText;
}

References