Using JSON schema to validate Hub Metadata

Author

Anna Krystalli

Published

November 11, 2022

#' Read one of the example hubmeta JSON files
#'
#' @param file File name of the JSON example to read.
#' @param dir Name of the directory, located under `json-schema` in the
#'   project, that contains `file`. Defaults to
#'   `"modified-hubmeta-examples"`.
#' @return The result of `jsonlite::read_json()` called with
#'   `simplifyVector = TRUE` and `simplifyDataFrame = FALSE`.
load_json_example <- function(file, dir = "modified-hubmeta-examples") {
    # Resolve the example's location relative to the project root
    example_path <- here::here("json-schema", dir, file)

    jsonlite::read_json(
        example_path,
        simplifyVector = TRUE,
        simplifyDataFrame = FALSE
    )
}

#' Validate a hubmeta JSON file and surface any validation errors
#'
#' Calls the `validator` function created elsewhere in this document with
#' `jsonvalidate::json_validator()` on `path`, using `verbose = TRUE`.
#'
#' @param path Path to the JSON file to validate; passed unchanged to
#'   `validator()`.
#' @return The validation result when validation succeeds. When it fails,
#'   the contents of the result's `"errors"` attribute: shown with `View()`
#'   and returned invisibly in interactive sessions, returned visibly
#'   otherwise.
apply_validator <- function(path) {
    validated <- validator(path, verbose = TRUE)
    if (isTRUE(validated)) {
        return(validated)
    }

    errors <- attr(validated, "errors")
    if (interactive()) {
        # View() opens a data viewer, so only call it when a user is
        # present; when the document is rendered the errors are returned
        # (and therefore printed) instead.
        View(errors)
        invisible(errors)
    } else {
        errors
    }
}

File names & paths

# File names of the two modified hubmeta examples
simple_mod_file <- "simple-hubmeta-mod.json"
complex_mod_file <- "complex-hubmeta-mod.json"

# Project-rooted paths to those files
simple_mod_path <- here::here(
    "json-schema",
    "modified-hubmeta-examples",
    simple_mod_file
)
complex_mod_path <- here::here(
    "json-schema",
    "modified-hubmeta-examples",
    complex_mod_file
)

Modified examples

I’ve modified the original examples, primarily to enable generalised validation of multiple round specifications. This will affect current hubUtils functionality, but now is a good time to change it if necessary.

Simple modified example

hub-infrastructure-experiments/json-schema/modified-hubmeta-examples/simple-hubmeta-mod.json

You can navigate the file by clicking below

// Observable cell: attach the simple modified example and parse it as JSON
simple = FileAttachment("modified-hubmeta-examples/simple-hubmeta-mod.json").json()
// Reference the cell so that its value is displayed
simple

Complex modified example

hub-infrastructure-experiments/json-schema/modified-hubmeta-examples/complex-hubmeta-mod.json

You can navigate the file by clicking below

// Observable cell: attach the complex modified example and parse it as JSON
complex = FileAttachment("modified-hubmeta-examples/complex-hubmeta-mod.json").json()
// Reference the cell so that its value is displayed
complex

Validate

Load the validator from hub-infrastructure-experiments/json-schema/hubmeta-schema.json

You can navigate the schema below

// Observable cell: attach the hubmeta JSON schema and parse it as JSON
schema = FileAttachment("hubmeta-schema.json").json()
// Reference the cell so that its value is displayed
schema
# Build a validator function from the hubmeta schema file, using the
# "ajv" engine
validator <- jsonvalidate::json_validator(
    schema = here::here("json-schema", "hubmeta-schema.json"),
    engine = "ajv"
)
# Validate the simple modified example
validator(simple_mod_path, verbose = TRUE)
[1] TRUE
# Validate the complex modified example
validator(complex_mod_path, verbose = TRUE)
[1] TRUE
# Validate the complex example again and extract the "errors" attribute
# from the result
validator(complex_mod_path, verbose = TRUE) |>
    attr("errors")
NULL

The 3 errors still present when validating the complex hubmeta arise from the fact that I’ve not yet figured out how to handle values which either should be a typed array or could contain a "$ref" = "#defs" key-value pair. Given that this functionality should be available to any property, I need to figure out how to encode it once in the schema (rather than for each property individually).

Experiments to address $ref validation

For context see https://github.com/Infectious-Disease-Modeling-Hubs/schemas/issues/1

# Parse the complex modified example into an R list
complex_mod_path <- here::here(
    "json-schema",
    "modified-hubmeta-examples",
    "complex-hubmeta-mod.json"
)
json_list <- jsonlite::read_json(
    complex_mod_path,
    simplifyVector = TRUE,
    simplifyDataFrame = FALSE
)

# Serialise back to JSON without unboxing:
# every vector, whatever its length, is written as an array
jsonlite::toJSON(
    json_list,
    null = "null",
    na = "string",
    pretty = TRUE
)
{
  "rounds": [
    {
      "round_id": ["round-1"],
      "model_tasks": [
        {
          "task_ids": {
            "origin_date": {
              "required": ["2022-09-03"],
              "optional": null
            },
            "scenario_id": {
              "required": [1],
              "optional": null
            },
            "location": {
              "required": [
                {
                  "$ref": ["#/$defs/task_ids/location/us_states"]
                }
              ],
              "optional": ["US"]
            },
            "target": {
              "required": null,
              "optional": ["weekly rate"]
            },
            "horizon": {
              "required": null,
              "optional": [1, 2]
            }
          },
          "output_types": {
            "mean": {
              "type_id": {
                "required": null,
                "optional": ["NA"]
              },
              "value": {
                "type": ["integer"],
                "minimum": [0]
              }
            },
            "quantile": {
              "type_id": {
                "required": [0.25, 0.5, 0.75],
                "optional": [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]
              },
              "value": {
                "type": ["numeric"],
                "minimum": [0],
                "maximum": [1]
              }
            },
            "cdf": {
              "type_id": {
                "required": [10, 20],
                "optional": null
              },
              "value": {
                "type": ["numeric"],
                "minimum": [0],
                "maximum": [1]
              }
            }
          }
        },
        {
          "task_ids": {
            "origin_date": {
              "required": ["2022-09-03"],
              "optional": null
            },
            "scenario_id": {
              "required": [1],
              "optional": null
            },
            "location": {
              "required": [
                {
                  "$ref": ["#/$defs/task_ids/location/us_states"]
                }
              ],
              "optional": ["US"]
            },
            "target": {
              "required": null,
              "optional": ["peak week"]
            },
            "horizon": {
              "required": null,
              "optional": ["NA"]
            }
          },
          "output_types": {
            "cdf": {
              "type_id": {
                "required": ["EW202240", "EW202241", "EW202242", "EW202243", "EW202244", "EW202245", "EW202246", "EW202247", "EW202248", "EW202249", "EW202250", "EW202251", "EW202252", "EW202301", "EW202302", "EW202303", "EW202304", "EW202305", "EW202306", "EW202307", "EW202308", "EW202309", "EW202310", "EW202311", "EW202312", "EW202313", "EW202314", "EW202315", "EW202316", "EW202317", "EW202318", "EW202319", "EW202320"],
                "optional": null
              },
              "value": {
                "type": ["numeric"],
                "minimum": [0]
              }
            }
          }
        }
      ],
      "submissions_due": {
        "start": ["2022-09-01"],
        "end": ["2022-09-05"]
      }
    },
    {
      "round_id": ["round-2"],
      "model_tasks": [
        {
          "task_ids": {
            "origin_date": {
              "required": ["2022-10-01"],
              "optional": null
            },
            "scenario_id": {
              "required": null,
              "optional": [2, 3]
            },
            "location": {
              "required": [
                {
                  "$ref": ["#/$defs/task_ids/location/us_states"]
                }
              ],
              "optional": ["US"]
            },
            "target": {
              "required": null,
              "optional": ["weekly rate"]
            },
            "age_group": {
              "required": null,
              "optional": ["0-5", "6-18", "19-24", "25-64", "65+"]
            },
            "horizon": {
              "required": null,
              "optional": [1, 2]
            }
          },
          "output_types": {
            "quantile": {
              "type_id": {
                "required": [0.25, 0.5, 0.75],
                "optional": [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]
              },
              "value": {
                "type": ["integer"],
                "minimum": [0]
              }
            }
          }
        }
      ],
      "submissions_due": {
        "start": ["2022-09-28"],
        "end": ["2022-10-01"]
      },
      "last_data_date": ["2022-09-30"]
    }
  ],
  "$defs": {
    "task_ids": {
      "location": {
        "us_states": ["01", "02", "04", "05", "06", "08", "09", "10", "11", "12", "13", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "44", "45", "46", "47", "48", "49", "50", "51", "53", "54", "55", "56"]
      }
    }
  }
} 
# Serialise back to JSON with unboxing:
# every length-1 vector is written as a single value instead of an array
jsonlite::toJSON(
    json_list,
    null = "null",
    na = "string",
    pretty = TRUE,
    auto_unbox = TRUE
)
{
  "rounds": [
    {
      "round_id": "round-1",
      "model_tasks": [
        {
          "task_ids": {
            "origin_date": {
              "required": "2022-09-03",
              "optional": null
            },
            "scenario_id": {
              "required": 1,
              "optional": null
            },
            "location": {
              "required": [
                {
                  "$ref": "#/$defs/task_ids/location/us_states"
                }
              ],
              "optional": "US"
            },
            "target": {
              "required": null,
              "optional": "weekly rate"
            },
            "horizon": {
              "required": null,
              "optional": [1, 2]
            }
          },
          "output_types": {
            "mean": {
              "type_id": {
                "required": null,
                "optional": "NA"
              },
              "value": {
                "type": "integer",
                "minimum": 0
              }
            },
            "quantile": {
              "type_id": {
                "required": [0.25, 0.5, 0.75],
                "optional": [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]
              },
              "value": {
                "type": "numeric",
                "minimum": 0,
                "maximum": 1
              }
            },
            "cdf": {
              "type_id": {
                "required": [10, 20],
                "optional": null
              },
              "value": {
                "type": "numeric",
                "minimum": 0,
                "maximum": 1
              }
            }
          }
        },
        {
          "task_ids": {
            "origin_date": {
              "required": "2022-09-03",
              "optional": null
            },
            "scenario_id": {
              "required": 1,
              "optional": null
            },
            "location": {
              "required": [
                {
                  "$ref": "#/$defs/task_ids/location/us_states"
                }
              ],
              "optional": "US"
            },
            "target": {
              "required": null,
              "optional": "peak week"
            },
            "horizon": {
              "required": null,
              "optional": "NA"
            }
          },
          "output_types": {
            "cdf": {
              "type_id": {
                "required": ["EW202240", "EW202241", "EW202242", "EW202243", "EW202244", "EW202245", "EW202246", "EW202247", "EW202248", "EW202249", "EW202250", "EW202251", "EW202252", "EW202301", "EW202302", "EW202303", "EW202304", "EW202305", "EW202306", "EW202307", "EW202308", "EW202309", "EW202310", "EW202311", "EW202312", "EW202313", "EW202314", "EW202315", "EW202316", "EW202317", "EW202318", "EW202319", "EW202320"],
                "optional": null
              },
              "value": {
                "type": "numeric",
                "minimum": 0
              }
            }
          }
        }
      ],
      "submissions_due": {
        "start": "2022-09-01",
        "end": "2022-09-05"
      }
    },
    {
      "round_id": "round-2",
      "model_tasks": [
        {
          "task_ids": {
            "origin_date": {
              "required": "2022-10-01",
              "optional": null
            },
            "scenario_id": {
              "required": null,
              "optional": [2, 3]
            },
            "location": {
              "required": [
                {
                  "$ref": "#/$defs/task_ids/location/us_states"
                }
              ],
              "optional": "US"
            },
            "target": {
              "required": null,
              "optional": "weekly rate"
            },
            "age_group": {
              "required": null,
              "optional": ["0-5", "6-18", "19-24", "25-64", "65+"]
            },
            "horizon": {
              "required": null,
              "optional": [1, 2]
            }
          },
          "output_types": {
            "quantile": {
              "type_id": {
                "required": [0.25, 0.5, 0.75],
                "optional": [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]
              },
              "value": {
                "type": "integer",
                "minimum": 0
              }
            }
          }
        }
      ],
      "submissions_due": {
        "start": "2022-09-28",
        "end": "2022-10-01"
      },
      "last_data_date": "2022-09-30"
    }
  ],
  "$defs": {
    "task_ids": {
      "location": {
        "us_states": ["01", "02", "04", "05", "06", "08", "09", "10", "11", "12", "13", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "44", "45", "46", "47", "48", "49", "50", "51", "53", "54", "55", "56"]
      }
    }
  }
} 
# Parse the complex modified example into an R list
complex_mod_path <- here::here(
    "json-schema",
    "modified-hubmeta-examples",
    "complex-hubmeta-mod.json"
)
json_list <- jsonlite::read_json(
    complex_mod_path,
    simplifyVector = TRUE,
    simplifyDataFrame = FALSE
)

# Instantiate a schema object from the hubmeta schema file ("ajv" engine)
schema <- jsonvalidate::json_schema$new(
    schema = here::here("json-schema", "hubmeta-schema.json"),
    engine = "ajv"
)

# Let the schema drive serialisation of the list to JSON
# NOTE(review): this call currently fails with the TypeError shown in the
# output below; the cause is not established here
json <- schema$serialise(json_list)
Error in context_eval(join(src), private$context, serialize, await): TypeError: Cannot convert undefined or null to object
# Use Schema to validate JSON
# NOTE(review): `json` only exists if the preceding `schema$serialise()`
# call succeeded; otherwise this fails with "object 'json' not found"
schema$validate(json)
Error in get_string(json): object 'json' not found