SQLizer Logo

Easily convert files into SQL databases

Converting JSON and XML Files Using SQLizer

JSON and XML files aren't naturally organized into rows and columns like CSV files, spreadsheets and database tables. In order to convert them into database tables, SQLizer needs to flatten them into tabular form. This page describes that process in more detail.

JSON Files

With JSON data, properties on an object are converted into columns and entries in an array are treated as new rows. If two sibling objects have different properties, the columns on the resulting table are a combination of all the properties seen at that level. For example:

[
    {
        "val1": "foo",
        "val2": 3
    },
    {
        "val1": "bar",
        "val2": 15,
        "val3": "2020-01-01"
    }
]

SQLizer converts the above into the following SQL:

CREATE TABLE IF NOT EXISTS example (
    `list_val1` VARCHAR(3) CHARACTER SET utf8,
    `list_val2` INT,
    `list_val3` DATETIME
);
INSERT INTO example VALUES
    ('foo',3,NULL),
    ('bar',15,'2020-01-01 00:00:00');
IF NOT EXISTS (
    select * from sysobjects where name='example' and xtype='U'
) CREATE TABLE example (
    [list_val1] NVARCHAR(3),
    [list_val2] INT,
    [list_val3] DATETIME
);
INSERT INTO example VALUES
    (N'foo',3,NULL),
    (N'bar',15,'2020-01-01 00:00:00');
CREATE TABLE IF NOT EXISTS "example" (
    "list_val1" TEXT,
    "list_val2" INT,
    "list_val3" TIMESTAMP
);
INSERT INTO "example" VALUES
    ('foo',3,NULL),
    ('bar',15,'2020-01-01 00:00:00');

The resulting table gets "val1", "val2" and "val3" columns.

Sometimes JSON objects are structured with lots of properties, and its obvious that they're indended to represent a list of entities. In this case, SQLizer treats those properties as if they were rows in a list. For example:

{
    "00001": { "prop1": 1, "prop2": 2 },
    "00002": { "prop1": 1, "prop2": 2 },
    "00003": { "prop1": 1, "prop2": 2 },
    ... lots more rows like this ...
    "99998": { "prop1": 1, "prop2": 2 },
    "99999": { "prop1": 1, "prop2": 2 }
}

If SQLizer finds more than 50 properties on an object, it will treat those properties like an array. So the SQL output for the example above would be:

CREATE TABLE IF NOT EXISTS lotsofproperties (
    `properties_key` INT,
    `properties_properties_prop1` INT,
    `properties_properties_prop2` INT
);
INSERT INTO lotsofproperties VALUES
    (00001,1,2),
    (00002,1,2),
    (00003,1,2),
    ...
    (99998,1,2),
    (99999,1,2);
IF NOT EXISTS (
    select * from sysobjects where name='lotsofproperties' and xtype='U'
) CREATE TABLE lotsofproperties (
    [properties_key] INT,
    [properties_properties_prop1] INT,
    [properties_properties_prop2] INT
);
INSERT INTO lotsofproperties VALUES
    (00001,1,2),
    (00002,1,2),
    (00003,1,2),
    ...
    (99998,1,2),
    (99999,1,2);
CREATE TABLE IF NOT EXISTS "lotsofproperties" (
    "properties_key" INT,
    "properties_properties_prop1" INT,
    "properties_properties_prop2" INT
);
INSERT INTO "lotsofproperties" VALUES
    (00001,1,2),
    (00002,1,2),
    (00003,1,2),
    ...
    (99998,1,2),
    (99999,1,2);

Sometimes JSON files contain nested entities with multiple levels of data. In that case SQLizer will work its way down the structure until it finds an entity with no children. Each of those entities will then be converted into a row, and all of the parent entities' properties will be copied onto that row. For example:

{
    "ID": "1",
    "Description": "top level description",
    "Orders": [
        {
            "ID": 1,
            "Date": "2020-01-02"
        },
        {
            "ID": 2,
            "Date": "2020-01-03",
            "Amount": 500.21
        }
    ],
    "Customers": [
        {
            "ID": 1,
            "Name": "ABC Ltd"
        },
        {
            "ID": 2,
            "Name": "XYZ Ltd",
            "Contacts": [
                "Mrs Smith",
                "Mr Jones"
            ]
        }
    ]
}

This will be converted into:

CREATE TABLE IF NOT EXISTS complex (
    `ID` INT,
    `Description` VARCHAR(21) CHARACTER SET utf8,
    `Orders_ID` INT,
    `Orders_Date` DATETIME,
    `Orders_Amount` NUMERIC(5, 2),
    `Customers_ID` INT,
    `Customers_Name` VARCHAR(7) CHARACTER SET utf8,
    `Customers_Contacts` VARCHAR(9) CHARACTER SET utf8
);
INSERT INTO complex VALUES
    (1,'top level description',1,'2020-01-02 00:00:00',NULL,NULL,NULL,NULL),
    (1,'top level description',2,'2020-01-03 00:00:00',500.21,NULL,NULL,NULL),
    (1,'top level description',NULL,NULL,NULL,1,'ABC Ltd',NULL),
    (1,'top level description',NULL,NULL,NULL,2,'XYZ Ltd','Mrs Smith'),
    (1,'top level description',NULL,NULL,NULL,2,'XYZ Ltd','Mr Jones');
IF NOT EXISTS (
    select * from sysobjects where name='complex' and xtype='U'
) CREATE TABLE complex (
    [ID] INT,
    [Description] NVARCHAR(21),
    [Orders_ID] INT,
    [Orders_Date] DATETIME,
    [Orders_Amount] NUMERIC(5, 2),
    [Customers_ID] INT,
    [Customers_Name] NVARCHAR(7),
    [Customers_Contacts] NVARCHAR(9)
);
INSERT INTO complex VALUES
    (1,N'top level description',1,'2020-01-02 00:00:00',NULL,NULL,NULL,NULL),
    (1,N'top level description',2,'2020-01-03 00:00:00',500.21,NULL,NULL,NULL),
    (1,N'top level description',NULL,NULL,NULL,1,N'ABC Ltd',NULL),
    (1,N'top level description',NULL,NULL,NULL,2,N'XYZ Ltd',N'Mrs Smith'),
    (1,N'top level description',NULL,NULL,NULL,2,N'XYZ Ltd',N'Mr Jones');
CREATE TABLE IF NOT EXISTS "complex" (
    "ID" INT,
    "Description" TEXT,
    "Orders_ID" INT,
    "Orders_Date" TIMESTAMP,
    "Orders_Amount" NUMERIC(5, 2),
    "Customers_ID" INT,
    "Customers_Name" TEXT,
    "Customers_Contacts" TEXT
);
INSERT INTO "complex" VALUES
    (1,'top level description',1,'2020-01-02 00:00:00',NULL,NULL,NULL,NULL),
    (1,'top level description',2,'2020-01-03 00:00:00',500.21,NULL,NULL,NULL),
    (1,'top level description',NULL,NULL,NULL,1,'ABC Ltd',NULL),
    (1,'top level description',NULL,NULL,NULL,2,'XYZ Ltd','Mrs Smith'),
    (1,'top level description',NULL,NULL,NULL,2,'XYZ Ltd','Mr Jones');

Notice how every row gets its ID and Description column populated. The first two rows represent the two Orders, The next three rows represent the Customer entities. The Customers_Contacts list results into two rows being inserted, each with the values from the second Customer object in the JSON file.

XML Files

SQLizer will work it's way down the nested structure of an XML file, until it finds an element that has no children. When it does it will convert that element into a database row, and include on that row a copy of all the attribute values of its parent elements. For example:

<?xml version="1.0" encoding="utf-8"?>
<Products>
    <Product name="widgets" id="1234">
        <Order id="1" customer_id="9" order_date="01/01/2014"/>
        <Order id="2" customer_id="3" order_date="01/02/2014"/>
        <Order id="3" customer_id="6" order_date="01/03/2014"/>
        <Order id="4" customer_id="12" order_date="01/04/2014"/>
    </Product>
</Products>

In this XML file, each of the Order elements is converted into a row. Each one of those rows has the "widgets" value in its Products_Product_name column, and the value "1234" in its Products_Product_id column. Both of these values come from the order's parent <Product name="widgets" id="1234"> element:

CREATE TABLE IF NOT EXISTS example (
    `Products` INT,
    `Products_Product` INT,
    `Products_Product_name` VARCHAR(7) CHARACTER SET utf8,
    `Products_Product_id` INT,
    `Products_Product_Order` INT,
    `Products_Product_Order_id` INT,
    `Products_Product_Order_customer_id` INT,
    `Products_Product_Order_order_date` DATETIME
);
INSERT INTO example VALUES
    (NULL,NULL,'widgets',1234,NULL,1,9,'2014-01-01 00:00:00'),
    (NULL,NULL,'widgets',1234,NULL,2,3,'2014-02-01 00:00:00'),
    (NULL,NULL,'widgets',1234,NULL,3,6,'2014-03-01 00:00:00'),
    (NULL,NULL,'widgets',1234,NULL,4,12,'2014-04-01 00:00:00');
IF NOT EXISTS (
    select * from sysobjects where name='example' and xtype='U'
) CREATE TABLE example (
    [Products] INT,
    [Products_Product] INT,
    [Products_Product_name] NVARCHAR(7),
    [Products_Product_id] INT,
    [Products_Product_Order] INT,
    [Products_Product_Order_id] INT,
    [Products_Product_Order_customer_id] INT,
    [Products_Product_Order_order_date] DATETIME
);
INSERT INTO example VALUES
    (NULL,NULL,N'widgets',1234,NULL,1,9,'2014-01-01 00:00:00'),
    (NULL,NULL,N'widgets',1234,NULL,2,3,'2014-02-01 00:00:00'),
    (NULL,NULL,N'widgets',1234,NULL,3,6,'2014-03-01 00:00:00'),
    (NULL,NULL,N'widgets',1234,NULL,4,12,'2014-04-01 00:00:00');
CREATE TABLE IF NOT EXISTS "example" (
    "Products" INT,
    "Products_Product" INT,
    "Products_Product_name" TEXT,
    "Products_Product_id" INT,
    "Products_Product_Order" INT,
    "Products_Product_Order_id" INT,
    "Products_Product_Order_customer_id" INT,
    "Products_Product_Order_order_date" TIMESTAMP
);
INSERT INTO "example" VALUES
    (NULL,NULL,'widgets',1234,NULL,1,9,'2014-01-01 00:00:00'),
    (NULL,NULL,'widgets',1234,NULL,2,3,'2014-02-01 00:00:00'),
    (NULL,NULL,'widgets',1234,NULL,3,6,'2014-03-01 00:00:00'),
    (NULL,NULL,'widgets',1234,NULL,4,12,'2014-04-01 00:00:00');

Notice how the column names have been built from the structure of the XML file.