Goal: Recursively and generically convert advanced XML strings into JSON.
-
This function convertAdvXMLtoJSON shows how to convert advanced XML strings into JSON.
-
This function handles constructs that convertXMLtoJSON fails on: 1) empty-element tags and 2) attributes.
<ADVELEM1 adv_attrA="adv_valA" /> <ADVELEM2 adv_attrA="adv_valA" adv_attrB="adv_valB"> <SUB> SUBDATA </SUB> </ADVELEM2>
-
Requires Eventing Storage (or metadata collection) and a "source" collection.
-
Will operate on any mutation where the KEY or meta.id starts with "advxml:".
-
Will enrich the source document with a new JSON object representing the XML data.
-
Maintains a checksum to prevent the overhead of conversion if the property
in_xml
is unchanged.
-
convertAdvXMLtoJSON
-
Input Data/Mutation
-
Output Data/Mutation
// To run configure the settings for this Function, convertAdvXMLtoJSON, as follows:
//
// Version 7.1+
// "Function Scope"
// *.* (or try bulk.data if non-privileged)
// "Listen to Location"
// bulk.data.source
// "Eventing Storage"
// rr100.eventing.metadata
// Binding(s)
// 1. "binding type", "alias name...", "bucket.scope.collection", "Access"
// "bucket alias", "src_col", "bulk.data.source", "read and write"
//
// Version 6.X
// "Source Bucket"
// source
// "MetaData Bucket"
// metadata
// Binding(s)
// 1. "binding type", "alias name...", "bucket", "Access"
// "bucket alias", "src_col", "source", "read and write"
// ===============================================================
// THIS IS AN APPLICATION-SPECIFIC CUSTOM TRANSFORMATION, ADAPT AS NEEDED
/**
 * Application-specific hook for post-processing a converted value.
 *
 * parseAdvXmlToJson passes the element name and the value it built for that
 * element; whatever is returned here is what gets stored under that name.
 * As shipped, no element name is special-cased, so every value is handed
 * back untouched.
 *
 * @param {string} key - Element (tag) name the value belongs to.
 * @param {*} tmp - Converted value for that element.
 * @returns {*} The value to store; by default `tmp` itself.
 */
function customParseFixups(key,tmp) {
    // Add a `case` per element name whose value needs reshaping.
    switch (key) {
        // Example 1: make an array of strings
        // case "resultStr":
        //     return tmp.trim().split(' ');
        // Example 2: make an array of numbers
        // case "resultNumStr":
        //     return tmp.trim().split(' ').map(str => Number(str));
        default:
            // Do nothing, this is a NOOP
            return tmp;
    }
}
// ===============================================================
/**
 * Mutation handler: enriches "advxml:" documents with a JSON rendering of
 * their .in_xml property.
 *
 * The handler stores a checksum of .in_xml next to the generated JSON and
 * skips the conversion when the stored checksum matches the current one.
 *
 * @param {Object} doc - The mutated document; .in_xml is read, .out_json and
 *     .xmlchksum are set on it.
 * @param {Object} meta - Mutation metadata; only meta.id (the document KEY) is used.
 */
function OnUpdate(doc, meta) {
    // filter out non XML
    if (!meta.id.startsWith("advxml:")) return;

    // The KEY started with "advxml:", try to process it
    // ===========================================================
    // *** Do other work required here on non .in_xml changes ***
    // ===========================================================

    // Nothing to convert when .in_xml is missing or is not a string. Without
    // this guard parseAdvXmlToJson would throw on its first xml.replace() call.
    if (typeof doc.in_xml !== "string") return;

    // let's see if we need to re-create our json representation.
    const xmlchksum = crc64(doc.in_xml);

    // ===========================================================
    // Don't reprocess if the doc.in_xml has not changed; this could be
    // a big performance win if the doc has other fields that mutate.
    // We do this via a checksum of the .in_xml property.
    if (doc.xmlchksum && doc.xmlchksum === xmlchksum) return;

    // Either this is the first pass, or the .in_xml property changed.
    // NOTE: parseAdvXmlToJson returns null for XML it rejects; that null is
    // stored in .out_json as-is.
    const jsonDoc = parseAdvXmlToJson(doc.in_xml);
    log(meta.id,"1. INPUT xml doc.in_xml :", doc.in_xml);
    log(meta.id,"2. CHECKSUM doc.in_xml :", xmlchksum);
    log(meta.id,"3. OUTPUT doc.out_json :", jsonDoc);
    doc.out_json = jsonDoc;
    doc.xmlchksum = xmlchksum;

    // ===========================================================
    // enrich the source collection with .out_json and .xmlchksum
    src_col[meta.id] = doc;
}
// 7.0.0 version uses String.matchAll, which eliminates the need to make our own MatchAll function; call as parseAdvXmlToJson(xml)
/**
 * Recursively converts an XML string into a plain object, folding element
 * attributes into the object built for their element.
 *
 * Result shape, per element name:
 *   - element with child elements and/or attributes -> object
 *   - element with only text                         -> the text (string)
 *   - empty element without attributes               -> null
 *   - element name seen more than once               -> array of the above
 * customParseFixups(key, value) is applied to the first value stored under a
 * name; later occurrences are appended to the array unmodified. A stored
 * value that is falsy (e.g. null) is replaced by the next occurrence instead
 * of being turned into an array.
 *
 * @param {string} xml - XML text to convert.
 * @param {boolean} [recurs] - Truthy on recursive calls; the one-time
 *     normalisation of the input is then skipped.
 * @returns {Object|null} The converted object, or null when an opening tag
 *     that carries attributes is directly followed by bare text (the
 *     offending fragment is logged), at any nesting depth.
 */
function parseAdvXmlToJson(xml, recurs) {
    const json = {};
    let source = xml;
    if (!recurs) {
        // 1st call, rewrite closures: '<tagName attrName="attrValue" >' becomes '<tagName attrName="attrValue" />'
        source = source.replace(/\"\s*>/g, '" />');
        // 1st call, expand: '<tagName attrName="attrValue" />' becomes '<tagName attrName="attrValue"></tagName>'
        source = source.replace(/<\s*([^\/><\s]+)\s+(\w[^<>]*)(\s*\/>)/gm, '<$1 $2></$1>');
    }

    // Each match is either a paired element (groups 1 = name, 2 = inner text)
    // or an attribute-less empty element such as '<tag/>' (group 3 = name).
    for (const res of source.matchAll(/(?:<([\w:]*)(?:\s[^>]*)*>)((?:(?!<\1).)*)(?:<\/\1>)|<([\w:]*)(?:\s*)*\/>/gm)) {
        const [element, pairedName, inner, emptyName] = res;

        // find all sets of 1..N attributes, if any, keyed by tag name
        const attrsByTag = new Map();
        for (const res1 of element.matchAll(/<\s*(\w[^\/><\s]*)\s+(\w[^<]*[^=<]\s*[=]\s*[\'\"][^<]+[\'\"])\s*>([^<>]*)</gm)) {
            const [opening, tagName, attrText, trailingText] = res1;
            if (trailingText !== "") {
                // illegal XML: <tag1 attr1="aval1">value1</tag1>
                log(`Illegal can not have bare "${trailingText}" value if we have attr(s) input:`, opening);
                return null;
            }
            const tagAttrs = new Map();
            for (const res2 of attrText.matchAll(/(\w+[^=<>]*)\s*[=]\s*[\"]([^\"]+)[\"]|(\w+[^=<>]*)\s*[=]\s*[\']([^\']+)[\']/gm)) {
                // Groups 1/2 belong to the double-quoted alternative, 3/4 to the
                // single-quoted one; the groups of the alternative that did not
                // match are undefined (not ""), so test for undefined.
                if (res2[1] !== undefined) tagAttrs.set(res2[1], res2[2]);
                if (res2[3] !== undefined) tagAttrs.set(res2[3], res2[4]);
            }
            attrsByTag.set(tagName, tagAttrs);
        }

        const key = pairedName || emptyName;
        let value = inner && parseAdvXmlToJson(inner, true);
        // A nested call rejected its input: give up as well instead of trying
        // to attach attributes to null (which used to throw a TypeError).
        if (value === null) return null;

        if (inner === "" && attrsByTag.size > 0) {
            value = {};
        }
        const ownAttrs = attrsByTag.get(key);
        if (ownAttrs) {
            for (const [name, attrValue] of ownAttrs) {
                value[name] = attrValue;
            }
        }

        // Prefer the structured value; fall back to the raw inner text, then null.
        const tmp = ((value && Object.keys(value).length) ? value : inner) || null;

        // Only an own property counts as "seen before"; an inherited member of
        // Object.prototype (constructor, toString, ...) must not be mistaken
        // for an earlier occurrence of a tag with that name.
        const previous = Object.prototype.hasOwnProperty.call(json, key) ? json[key] : undefined;
        if (Array.isArray(previous)) {
            previous.push(tmp);
        } else if (previous) {
            // we have seen this key before, change from a single value to an array of values
            json[key] = [previous, tmp];
        } else {
            // link to a custom function
            json[key] = customParseFixups(key, tmp);
        }
    }
    return json;
}
/*
// need this for 6.6.0 version
function* MatchAll(str, regExp) {
if (!regExp.global) {
throw new TypeError('Flag /g must be set!');
}
const localCopy = new RegExp(regExp, regExp.flags);
let match;
while (match = localCopy.exec(str)) {
yield match;
}
}
// 6.6.0 version: no String.matchAll, so we need our own MatchAll function; call as parseAdvXmlToJson(xml)
function parseAdvXmlToJson(xml, recurs) {
const json = {};
if (!recurs) {
// 1st call, Fix bad closures, Transform Example : '<tagName attrName="attrValue" >' becomes '<tagName attrName="attrValue" />'
xml = xml.replace(/\"\s*>/g, '" />');
// 1st call, Transform Example : '<tagName attrName="attrValue" />' becomes '<tagName attrName="attrValue"></tagName>'
xml = xml.replace(/<\s*([^\/><\s]+)\s+(\w[^<>]*)(\s*\/>)/gm, '<$1 $2></$1>');
}
for (const res of MatchAll(xml, /(?:<([\w:]*)(?:\s[^>]*)*>)((?:(?!<\1).)*)(?:<\/\1>)|<([\w:]*)(?:\s*)*\/>/gm)) {
// find all sest of 1..N attributes if any
var attrs = {};
for (const res1 of MatchAll(res[0], /<\s*(\w[^\/><\s]*)\s+(\w[^<]*[^=<]\s*[=]\s*[\'\"][^<]+[\'\"])\s*>([^<>]*)</gm)) {
attrs[res1[1]] = {};
if (res1[3] !== "") {
// illegal XML: <tag1 attr1="aval1">value1</tag1>
log('Illegal can not have bare "' + res1[3] + '" value if we have attr(s) input:', res1[0]);
return null;
} else {
for (const res2 of MatchAll(res1[2], /(\w+[^=<>]*)\s*[=]\s*[\"]([^\"]+)[\"]|(\w+[^=<>]*)\s*[=]\s*[\']([^\']+)[\']/gm)) {
if (res2[1] !== "") attrs[res1[1]][res2[1]] = res2[2];
if (res2[3] !== "") attrs[res1[1]][res2[3]] = res2[4];
}
}
}
const key = res[1] || res[3];
var value = res[2] && parseAdvXmlToJson(res[2], true);
if (res[2] === "" && Object.keys(attrs).length > 0) {
value = {};
}
if (attrs[key]) {
for (const p in attrs[key]) {
if (attrs[key][p]) {
value[p] = attrs[key][p];
}
}
}
attrs = {};
var tmp = ((value && Object.keys(value).length) ? value : res[2]) || null;
if (Array.isArray(json[key]) == false) {
if (json[key]) {
// we have seen this key before change from object to an array of objects
var old = json[key];
json[key] = [];
json[key].push(old);
json[key].push(tmp);
} else {
// link to a custom function
tmp = customParseFixups(key,tmp);
json[key] = tmp;
}
} else {
json[key].push(tmp);
}
}
return json;
}
*/
INPUT: KEY advxml::1
{
"type": "advxml",
"id": 1,
"in_xml": "<CD><ADVELEM1 adv_attrA=\"adv_valA\"/><ADVELEM2 adv_attrA=\"adv_valA\" adv_attrB=\"adv_valB\"><SUB>SUBDATA</SUB></ADVELEM2><TITLE>EmpireBurlesque</TITLE><ARTIST>BobDylan</ARTIST><COUNTRY>USA</COUNTRY><COMPANY>Columbia</COMPANY><PRICE>10.90</PRICE><YEAR>1985</YEAR></CD>"
}
UPDATED/OUTPUT: KEY advxml::1
{
"type": "advxml",
"id": 1,
"in_xml": "<CD><ADVELEM1 adv_attrA=\"adv_valA\"/><ADVELEM2 adv_attrA=\"adv_valA\" adv_attrB=\"adv_valB\"><SUB>SUBDATA</SUB></ADVELEM2><TITLE>EmpireBurlesque</TITLE><ARTIST>BobDylan</ARTIST><COUNTRY>USA</COUNTRY><COMPANY>Columbia</COMPANY><PRICE>10.90</PRICE><YEAR>1985</YEAR></CD>",
"out_json": {
"CD": {
"ADVELEM1": {
"adv_attrA": "adv_valA"
},
"ADVELEM2": {
"SUB": "SUBDATA",
"adv_attrA": "adv_valA",
"adv_attrB": "adv_valB"
},
"TITLE": "EmpireBurlesque",
"ARTIST": "BobDylan",
"COUNTRY": "USA",
"COMPANY": "Columbia",
"PRICE": "10.90",
"YEAR": "1985"
}
},
"xmlchksum": "99b252d9af646320"
}