Function: convertAdvXMLtoJSON

  • Capella Operational
      +

      Goal: Recursively and generically convert advanced XML strings into JSON.

      • This function convertAdvXMLtoJSON shows how to convert advanced XML strings into JSON.

      • This function handles constructs that convertXMLtoJSON fails on: 1) empty-element tags and 2) attributes. For example:

        <ADVELEM1 adv_attrA="adv_valA" />
        
        <ADVELEM2 adv_attrA="adv_valA" adv_attrB="adv_valB">
            <SUB>
                SUBDATA
            </SUB>
        </ADVELEM2>
      • Requires Eventing Storage (or metadata collection) and a "source" collection.

      • Will operate on any mutation where the KEY or meta.id starts with "advxml:".

      • Will enrich the source document with a new JSON object representing the XML data.

      • Maintains a checksum to prevent the overhead of conversion if the property in_xml is unchanged.

      • convertAdvXMLtoJSON

      • Input Data/Mutation

      • Output Data/Mutation

      // To run configure the settings for this Function, convertAdvXMLtoJSON, as follows:
      //
      // Version 7.1+
      //   "Function Scope"
      //     *.* (or try bulk.data if non-privileged)
      //   "Listen to Location"
      //     bulk.data.source
      //   "Eventing Storage"
      //     rr100.eventing.metadata
      //   Binding(s)
      //    1. "binding type", "alias name...", "bucket.scope.collection", "Access"
      //       "bucket alias", "src_col",       "bulk.data.source",        "read and write"
      //
      // Version 6.X
      //   "Source Bucket"
      //     source
      //   "MetaData Bucket"
      //     metadata
      //   Binding(s)
      //    1. "binding type", "alias name...", "bucket",     "Access"
      //       "bucket alias", "src_col",       "source",     "read and write"
      
      // ===============================================================
      // APPLICATION-SPECIFIC CUSTOM TRANSFORMATION HOOK (EDIT AS NEEDED)
      /**
       * Hook for reshaping a parsed value before it is stored in the JSON result.
       *
       * The shipped implementation is a NOOP: it hands the value back untouched.
       * To customize, test `key` for the element name you care about and return
       * a transformed value, for example:
       *
       *   Example 1 - turn a space separated string into an array of strings:
       *       if (key === "resultStr") {
       *           return tmp.trim().split(' ');
       *       }
       *
       *   Example 2 - turn a space separated string into an array of numbers:
       *       if (key === "resultNumStr") {
       *           return tmp.trim().split(' ').map(str => Number(str));
       *       }
       *
       * @param {string} key - name of the element whose value was just parsed.
       * @param {*} tmp - the parsed value for that element.
       * @returns {*} the value to store under `key` (here: `tmp` unchanged).
       */
      function customParseFixups(key, tmp) {
          // No transformation configured: pass the value straight through.
          return tmp;
      }
      // ===============================================================
      
      /**
       * Eventing entry point: enriches documents whose key starts with "advxml:"
       * with a JSON rendering of their `in_xml` property.
       *
       * The result is written back through the `src_col` binding as `out_json`,
       * together with `xmlchksum`, a checksum of `in_xml` used to skip
       * re-conversion when `in_xml` has not changed.
       *
       * @param {Object} doc - the mutated document.
       * @param {Object} meta - mutation metadata; only `meta.id` (the KEY) is read.
       */
      function OnUpdate(doc, meta) {
          // filter out non XML
          if (!meta.id.startsWith("advxml:")) return;
          // The KEY started with "advxml" try to process it
          // ===========================================================
          // *** Do other  work required here on non .in_xml changes ***
          // ===========================================================
          // Nothing to convert when the XML payload is absent or not a string;
          // without this guard the parser would throw on xml.replace().
          if (typeof doc.in_xml !== "string") {
              log(meta.id, "skipped: doc.in_xml is missing or not a string");
              return;
          }
          // let's see if we need to re-create our json representation.
          const xmlchksum = crc64(doc.in_xml);
          // ===========================================================
          // Don't reprocess if the doc.in_xml has not changed this could be
          // a big performance win if the doc has other fields that mutate.
          // We do this via a checksum of the .in_xml property.
          if (doc.xmlchksum && doc.xmlchksum === xmlchksum) return;
          // Either this is the first pass, or the .in_xml property changed.
          const jsonDoc = parseAdvXmlToJson(doc.in_xml);
          log(meta.id,"1. INPUT  xml doc.in_xml :", doc.in_xml);
          log(meta.id,"2. CHECKSUM doc.in_xml   :", xmlchksum);
          log(meta.id,"3. OUTPUT doc.out_json   :", jsonDoc);
          doc.out_json = jsonDoc;
          doc.xmlchksum = xmlchksum;
          // ===========================================================
          // enrich the source collection with .out_json and .xmlchksum
          src_col[meta.id] = doc;
      }
      
      // 7.0.0+ version: uses the built-in String.matchAll, which eliminates the need for our own
      // MatchAll helper. Call as parseAdvXmlToJson(xml).
      /**
       * Recursively converts an XML string (elements, attributes and
       * empty-element tags) into a plain object.
       *
       * Attributes are merged into the element's object next to its child
       * elements; sibling elements sharing a name are collected into an array.
       * customParseFixups() is applied to the first occurrence of each key only.
       *
       * @param {string} xml - XML text to convert.
       * @param {boolean} [recurs] - truthy on recursive calls, which skips the
       *     one-time normalization of attribute-carrying tags.
       * @returns {Object|null} the converted object, or null when the attribute
       *     scan finds bare text directly after a tag that has attributes
       *     (the offending input is reported through log()).
       */
      function parseAdvXmlToJson(xml, recurs) {
          const json = {};
          // Own-property test so inherited names (e.g. "toString") and null values are handled correctly.
          const hasOwn = (obj, name) => Object.prototype.hasOwnProperty.call(obj, name);
          if (!recurs) {
              // 1st call, Fix bad closures,  Transform Example : '<tagName attrName="attrValue" >'  becomes '<tagName attrName="attrValue" />'
              xml = xml.replace(/\"\s*>/g, '" />');
              // 1st call, Transform Example : '<tagName attrName="attrValue" />' becomes '<tagName attrName="attrValue"></tagName>'
              xml = xml.replace(/<\s*([^\/><\s]+)\s+(\w[^<>]*)(\s*\/>)/gm, '<$1 $2></$1>');
          }
          // The optional group `(?:\s[^>]*)?` and the plain `\s*` accept exactly the same text as the
          // former nested quantifiers `(?:\s[^>]*)*` / `(?:\s*)*`, but cannot backtrack exponentially
          // on an unterminated tag followed by a long run of whitespace.
          for (const res of xml.matchAll(/(?:<([\w:]*)(?:\s[^>]*)?>)((?:(?!<\1).)*)(?:<\/\1>)|<([\w:]*)\s*\/>/gm)) {
              // find all sets of 1..N attributes, if any, keyed by tag name
              const attrs = {};
              for (const res1 of res[0].matchAll(/<\s*(\w[^\/><\s]*)\s+(\w[^<]*[^=<]\s*[=]\s*[\'\"][^<]+[\'\"])\s*>([^<>]*)</gm)) {
                  attrs[res1[1]] = {};
                  if (res1[3] !== "") {
                      // illegal XML: <tag1 attr1="aval1">value1</tag1>
                      log('Illegal can not have bare "' + res1[3] + '" value if we have attr(s) input:', res1[0]);
                      return null;
                  }
                  for (const res2 of res1[2].matchAll(/(\w+[^=<>]*)\s*[=]\s*[\"]([^\"]+)[\"]|(\w+[^=<>]*)\s*[=]\s*[\']([^\']+)[\']/gm)) {
                      // Only one alternative matches per attribute; the other alternative's groups are
                      // undefined and must be skipped, otherwise a bogus "undefined" key is written
                      // (and an attribute really named "undefined" would be overwritten).
                      if (res2[1] !== undefined) attrs[res1[1]][res2[1]] = res2[2];
                      if (res2[3] !== undefined) attrs[res1[1]][res2[3]] = res2[4];
                  }
              }
              const key = res[1] || res[3];
              let value = res[2] && parseAdvXmlToJson(res[2], true);
              if (res[2] === "" && Object.keys(attrs).length > 0) {
                  // empty element with attributes: start from an empty object to hold them
                  value = {};
              }
              if (hasOwn(attrs, key)) {
                  for (const [attrName, attrValue] of Object.entries(attrs[key])) {
                      value[attrName] = attrValue;
                  }
              }
              // Prefer the structured value; fall back to the raw text, then to null for empty elements.
              const tmp = ((value && Object.keys(value).length) ? value : res[2]) || null;
              if (Array.isArray(json[key])) {
                  json[key].push(tmp);
              } else if (hasOwn(json, key)) {
                  // we have seen this key before (even if its value was null):
                  // change from a single value to an array of values
                  json[key] = [json[key], tmp];
              } else {
                  // first occurrence: link to a custom function
                  json[key] = customParseFixups(key, tmp);
              }
          }
          return json;
      }
      
      /*
      // need this for 6.6.0 version
      function* MatchAll(str, regExp) {
        if (!regExp.global) {
          throw new TypeError('Flag /g must be set!');
        }
        const localCopy = new RegExp(regExp, regExp.flags);
        let match;
        while (match = localCopy.exec(str)) {
          yield match;
        }
      }
      
      // 6.6.0 version: no String.matchAll, so we need our own MatchAll function; call as parseAdvXmlToJson(xml)
      function parseAdvXmlToJson(xml, recurs) {
      	const json = {};
      	if (!recurs) {
      		// 1st call, Fix bad closures,  Transform Example : '<tagName attrName="attrValue" >'  becomes '<tagName attrName="attrValue" />'
      		xml = xml.replace(/\"\s*>/g, '" />');
      		// 1st call, Transform Example : '<tagName attrName="attrValue" />' becomes '<tagName attrName="attrValue"></tagName>'
      		xml = xml.replace(/<\s*([^\/><\s]+)\s+(\w[^<>]*)(\s*\/>)/gm, '<$1 $2></$1>');
      	}
      	for (const res of MatchAll(xml, /(?:<([\w:]*)(?:\s[^>]*)*>)((?:(?!<\1).)*)(?:<\/\1>)|<([\w:]*)(?:\s*)*\/>/gm)) {
      		// find all sest of 1..N attributes if any
      		var attrs = {};
      		for (const res1 of MatchAll(res[0], /<\s*(\w[^\/><\s]*)\s+(\w[^<]*[^=<]\s*[=]\s*[\'\"][^<]+[\'\"])\s*>([^<>]*)</gm)) {
      			attrs[res1[1]] = {};
      			if (res1[3] !== "") {
      				// illegal XML: <tag1 attr1="aval1">value1</tag1>
      				log('Illegal can not have bare "' + res1[3] + '" value if we have attr(s) input:', res1[0]);
      				return null;
      			} else {
      				for (const res2 of MatchAll(res1[2], /(\w+[^=<>]*)\s*[=]\s*[\"]([^\"]+)[\"]|(\w+[^=<>]*)\s*[=]\s*[\']([^\']+)[\']/gm)) {
      					if (res2[1] !== "") attrs[res1[1]][res2[1]] = res2[2];
      					if (res2[3] !== "") attrs[res1[1]][res2[3]] = res2[4];
      				}
      			}
      		}
      		const key = res[1] || res[3];
      		var value = res[2] && parseAdvXmlToJson(res[2], true);
      		if (res[2] === "" && Object.keys(attrs).length > 0) {
      			value = {};
      		}
      		if (attrs[key]) {
      			for (const p in attrs[key]) {
      				if (attrs[key][p]) {
      					value[p] = attrs[key][p];
      				}
      			}
      		}
      		attrs = {};
      		var tmp = ((value && Object.keys(value).length) ? value : res[2]) || null;
      		if (Array.isArray(json[key]) == false) {
      			if (json[key]) {
      				// we have seen this key before change from object to an array of objects
                      var old = json[key];
                      json[key] = [];
                      json[key].push(old);
                      json[key].push(tmp);
      			} else {
      			    // link to a custom function
      			    tmp = customParseFixups(key,tmp);
      				json[key] = tmp;
      			}
      		} else {
      			json[key].push(tmp);
      		}
      	}
      	return json;
      }
      */
      INPUT: KEY advxml::1
      
      {
        "type": "advxml",
        "id": 1,
        "in_xml": "<CD><ADVELEM1 adv_attrA=\"adv_valA\"/><ADVELEM2 adv_attrA=\"adv_valA\" adv_attrB=\"adv_valB\"><SUB>SUBDATA</SUB></ADVELEM2><TITLE>EmpireBurlesque</TITLE><ARTIST>BobDylan</ARTIST><COUNTRY>USA</COUNTRY><COMPANY>Columbia</COMPANY><PRICE>10.90</PRICE><YEAR>1985</YEAR></CD>"
      }
      UPDATED/OUTPUT: KEY advxml::1
      
      {
        "type": "advxml",
        "id": 1,
        "in_xml": "<CD><ADVELEM1 adv_attrA=\"adv_valA\"/><ADVELEM2 adv_attrA=\"adv_valA\" adv_attrB=\"adv_valB\"><SUB>SUBDATA</SUB></ADVELEM2><TITLE>EmpireBurlesque</TITLE><ARTIST>BobDylan</ARTIST><COUNTRY>USA</COUNTRY><COMPANY>Columbia</COMPANY><PRICE>10.90</PRICE><YEAR>1985</YEAR></CD>",
        "out_json": {
          "CD": {
            "ADVELEM1": {
              "adv_attrA": "adv_valA"
            },
            "ADVELEM2": {
              "SUB": "SUBDATA",
              "adv_attrA": "adv_valA",
              "adv_attrB": "adv_valB"
            },
            "TITLE": "EmpireBurlesque",
            "ARTIST": "BobDylan",
            "COUNTRY": "USA",
            "COMPANY": "Columbia",
            "PRICE": "10.90",
            "YEAR": "1985"
          }
        },
        "xmlchksum": "99b252d9af646320"
      }