diff --git a/jupyter/parsing.ipynb b/jupyter/parsing.ipynb index ae8299d..aadd640 100644 --- a/jupyter/parsing.ipynb +++ b/jupyter/parsing.ipynb @@ -6,22 +6,23 @@ "metadata": {}, "source": [ "# Basic JSON file parsing\n", - "## Info gathered by the scientist on Experiment 41 \"NEW PLD Deposition Layer\"\n", - "### General info\n", + "## Info gathered by the scientist on eLabFTW\n", + "### Experiment 45 \"NEW PLD Deposition Layer\"\n", + "#### General info\n", "* Date and time of creation\n", "* Category\n", "* Full name of the scientist\n", "* Related items (sample, PLD target)\n", "\n", - "### Instrument\n", - "* Chamber\n", + "#### Instrument\n", + "* Chamber (by ID)\n", "* Laser system\n", "* RHEED system\n", "\n", - "### Process\n", - "* Sample\n", + "#### Process\n", + "* Sample (by ID)\n", "* Layer progressive number\n", - "* Target\n", + "* Target (by ID)\n", "* Heater temperature\n", "* Heater target distance\n", "* Buffer gas\n", @@ -32,14 +33,26 @@ "* Repetition rate\n", "* Thickness\n", "\n", - "### Post annealing\n", + "#### Post annealing\n", "* Buffer gas used in PA\n", "* Process pressure of PA\n", "* Heater temperature of PA\n", "* Duration of PA\n", "\n", - "## Basic parser\n", - "Let's start by loading and printing the contents of Experiment 41's JSON as downloaded from eLabFTW." + "### Chamber\n", + "\n", + "### Sample\n", + "\n", + "### Target" + ] + }, + { + "cell_type": "markdown", + "id": "c6321d97-4c3e-4e73-a3a2-e3f23ae0a733", + "metadata": {}, + "source": [ + "## Brick by brick\n", + "Let's start by loading and printing the contents of Experiment 45's JSON as downloaded from eLabFTW." 
] }, { @@ -632,7 +645,7 @@ "id": "4a7ff14f-d2fc-4485-a174-a23248791a6f", "metadata": {}, "source": [ - "Now entering the second layer: Experiment 43.\n", + "Now entering the second layer: Experiment 46.\n", "\n", "If I were to create a \"layers\" dictionary with the same info from the two different experiments it would look like this:" ] @@ -743,7 +756,7 @@ "* Names every layer \"layer_X\" where X is the progressive number starting from 1 (not 0).\n", "\n", "### Multiple layers from uncategorized files\n", - "Supposing I don't know that files *experiment_41_elab.json* and *experiment_43_elab.json* contain data of layers 1 and 2 of the same sample NA-26-001 I can always load every file in the folder indiscriminately and:\n", + "Supposing I don't know that files *experiment_45_elab.json* and *experiment_46_elab.json* contain data of layers 1 and 2 of the same sample NA-26-001 I can always load every file in the folder indiscriminately and:\n", "* Filter out every non-eLabFTW file (by some recognition pattern).\n", "* Group the data by the sample it's associated to.\n", "\n", @@ -762,8 +775,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "../tests/objects/experiment_46_elab.json\n", - "../tests/objects/experiment_45_elab.json\n" + "../tests/objects/experiment_45_elab.json\n", + "../tests/objects/experiment_46_elab.json\n" ] } ], @@ -811,7 +824,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{855: {'instrument': {'deposition_chamber': 72, 'laser_system': 'Excimer ', 'rheed_system': 'staib'}, 'multilayer': {'layer_2': {'operator': 'Emiliano Di Gennaro', 'created_at': '2026-01-20 16:18:48', 'sample': {'type': 'items', 'value': 855, 'group_id': 4, 'position': 0}, 'temperature': {'type': 'number', 'unit': '°C', 'units': ['°C'], 'value': '500', 'group_id': 4, 'position': 7}, 'target': {'type': 'items', 'value': 466, 'group_id': 4, 'position': 2, 'required': True}}, 'layer_1': {'operator': 'Emiliano Di Gennaro', 'created_at': '2026-01-20 16:11:32', 
'sample': {'type': 'items', 'value': 855, 'group_id': 4, 'position': 0}, 'temperature': {'type': 'number', 'unit': '°C', 'units': ['°C'], 'value': '760', 'group_id': 4, 'position': 7}, 'target': {'type': 'items', 'value': 854, 'group_id': 4, 'position': 2, 'required': True}}}}}\n" + "{855: {'instrument': {'deposition_chamber': 72, 'laser_system': 'Excimer ', 'rheed_system': 'staib'}, 'multilayer': {'layer_1': {'operator': 'Emiliano Di Gennaro', 'created_at': '2026-01-20 16:11:32', 'sample': {'type': 'items', 'value': 855, 'group_id': 4, 'position': 0}, 'temperature': {'type': 'number', 'unit': '°C', 'units': ['°C'], 'value': '760', 'group_id': 4, 'position': 7}, 'target': {'type': 'items', 'value': 854, 'group_id': 4, 'position': 2, 'required': True}}, 'layer_2': {'operator': 'Emiliano Di Gennaro', 'created_at': '2026-01-20 16:18:48', 'sample': {'type': 'items', 'value': 855, 'group_id': 4, 'position': 0}, 'temperature': {'type': 'number', 'unit': '°C', 'units': ['°C'], 'value': '500', 'group_id': 4, 'position': 7}, 'target': {'type': 'items', 'value': 466, 'group_id': 4, 'position': 2, 'required': True}}}}}\n" ] } ], @@ -853,7 +866,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "id": "6a5281dc-7fc3-4de7-845c-2bc2b54d4bb1", "metadata": {}, "outputs": [], @@ -862,10 +875,11 @@ "\n", "def find_missing(lst):\n", " '''\n", - " Finds missing integers in sorted list.\n", - " Time complexity is NlogN but since N is at most 10^2 it's not a problem for us.\n", + " Finds missing integers in unsorted list.\n", + " Time complexity is NlogN but since N is at most 10^1 it's not a problem for us.\n", " Source: geekforgeeks.org.\n", " '''\n", + " lst.sort() # sorts list\n", " return sorted(set(range(lst[0], lst[-1])) - set(lst))\n", "\n", "for item in sample_dict:\n", @@ -876,7 +890,42 @@ " print(\"Warning: some layers appear to be missing.\")\n", " print(f\"The missing layers are: \")\n", " for i in missing:\n", - " print(f\"* layer_{i}\")\n" + " 
print(f\"* layer_{i}\")" + ] + }, + { + "cell_type": "markdown", + "id": "028ac2b1-3389-472d-ba05-de6cfc9a9fda", + "metadata": {}, + "source": [ + "#### Find duplicates" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "14583887-8feb-4507-a06d-ddc557c0a875", + "metadata": {}, + "outputs": [], + "source": [ + "def find_duplicates(lst): # list of integers\n", + " result = []\n", + " lst.sort() # sort list just in case\n", + " for i in range(len(lst)-1):\n", + " #print(lst[i]) # debug\n", + " if lst[i] == lst[i+1]:\n", + " result.append(lst[i])\n", + " return sorted(set(result))\n", + "\n", + "for item in sample_dict:\n", + " layer_names = list(sample_dict[item].get(\"multilayer\").keys())\n", + " numbers = sorted(int(layer.split('_')[1]) for layer in layer_names)\n", + " dupes = find_duplicates(numbers)\n", + " if dupes:\n", + " print(\"Warning: some layers are duplicated.\")\n", + " print(f\"The duplicate layers are: \")\n", + " for i in dupes:\n", + " print(f\"* layer_{i}\")" ] }, { @@ -892,7 +941,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "24793e2b-67bb-4e8b-9d93-7802d3af7fca", "metadata": { "scrolled": true @@ -976,10 +1025,295 @@ "print(sample_title)" ] }, + { + "cell_type": "markdown", + "id": "ba4f0459-8da0-494d-b0c2-23dae509538c", + "metadata": {}, + "source": [ + "Now all that's left for us to do is merge the results to create a single dictionary with the name of the sample and its different layers." + ] + }, + { + "cell_type": "markdown", + "id": "d714bde9-73a2-4365-b54e-d129533aa3de", + "metadata": {}, + "source": [ + "## Basic parser\n", + "The parser needs:\n", + "* The code from the section \"*Multiple layers from uncategorized files*\" responsible for fetching and grouping data on the layers.\n", + "* The `find_missing` and `find_duplicates` functions.\n", + "* The code from the previous section to collect the names of the samples." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "daa59593-fd40-4b8a-b7f5-5cdbd6482fc3", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Paste API key here: ········\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"Na-26-001\": {\n", + " \"instrument\": {\n", + " \"deposition_chamber\": 72,\n", + " \"laser_system\": \"Excimer \",\n", + " \"rheed_system\": \"staib\"\n", + " },\n", + " \"multilayer\": {\n", + " \"layer_1\": {\n", + " \"operator\": \"Emiliano Di Gennaro\",\n", + " \"created_at\": \"2026-01-20 16:11:32\",\n", + " \"sample\": {\n", + " \"type\": \"items\",\n", + " \"value\": 855,\n", + " \"group_id\": 4,\n", + " \"position\": 0\n", + " },\n", + " \"temperature\": {\n", + " \"type\": \"number\",\n", + " \"unit\": \"\\u00b0C\",\n", + " \"units\": [\n", + " \"\\u00b0C\"\n", + " ],\n", + " \"value\": \"760\",\n", + " \"group_id\": 4,\n", + " \"position\": 7\n", + " },\n", + " \"target\": {\n", + " \"type\": \"items\",\n", + " \"value\": 854,\n", + " \"group_id\": 4,\n", + " \"position\": 2,\n", + " \"required\": true\n", + " }\n", + " },\n", + " \"layer_2\": {\n", + " \"operator\": \"Emiliano Di Gennaro\",\n", + " \"created_at\": \"2026-01-20 16:18:48\",\n", + " \"sample\": {\n", + " \"type\": \"items\",\n", + " \"value\": 855,\n", + " \"group_id\": 4,\n", + " \"position\": 0\n", + " },\n", + " \"temperature\": {\n", + " \"type\": \"number\",\n", + " \"unit\": \"\\u00b0C\",\n", + " \"units\": [\n", + " \"\\u00b0C\"\n", + " ],\n", + " \"value\": \"500\",\n", + " \"group_id\": 4,\n", + " \"position\": 7\n", + " },\n", + " \"target\": {\n", + " \"type\": \"items\",\n", + " \"value\": 466,\n", + " \"group_id\": 4,\n", + " \"position\": 2,\n", + " \"required\": true\n", + " }\n", + " }\n", + " }\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "import os, json, requests\n", + "from getpass import getpass\n", + "\n", + "def 
valid_elabfiles(path):\n", + " '''Lookup directory \"path\" and\n", + " returns list of valid eLabFTW\n", + " Experiment JSON files.'''\n", + " elabfiles = []\n", + " for filename in os.listdir(path):\n", + " if filename.endswith(\".json\"):\n", + " try:\n", + " with open(os.path.join(path, filename), \"r\") as f:\n", + " data = json.load(f)\n", + " if data.get(\"elabid\"): # insert specific NeXus requirements here later\n", + " \n", + " elabfiles.append(filename)\n", + " f.close()\n", + " except json.decoder.JSONDecodeError as e: # invalid files \"masked\" as JSON\n", + " #print(f\"wait a moment: {e}\") # just for debug\n", + " pass\n", + " return elabfiles\n", + "\n", + "def call_sample(apikey, elabid, SERVER_URL=\"https://elabftw.fisica.unina.it/\"): # TO-DO: rm default server\n", + " '''Queries the Resources (/items) API endpoint\n", + " of eLabFTW instance to request data (JSON)\n", + " on a certain sample given its eLab-ID.\n", + " \n", + " Requires an active (RO/RW) API key.\n", + " Defaults to elabftw.fisica.unina.it.'''\n", + " full_elab_url = f\"{SERVER_URL}api/v2\" # API endpoint root for eLabFTW\n", + " items_url = f\"{full_elab_url}/items\" # API endpoint /items\n", + " header = {\n", + " \"Authorization\": apikey,\n", + " \"Content-Type\": \"application/json\"\n", + " }\n", + " sample = requests.get(\n", + " headers=header,\n", + " url=f\"{items_url}/{elabid}\",\n", + " verify=True\n", + " )\n", + " return sample.json()\n", + "\n", + "def id2sample(apikey, elabid):\n", + " '''Fetches sample data (JSON) from eLabFTW\n", + " instance (using function \"call_sample()\")\n", + " and extracts significant information.\n", + " \n", + " Currently, it only returns the sample's title.'''\n", + " #apikey = getpass(\"Paste API key here: \") # move outside loops\n", + " sample_data = call_sample(apikey, elabid)\n", + " sample_title = sample_data[\"title\"]\n", + " return sample_title\n", + "\n", + "def fetch_and_group(path):\n", + " '''Fetches experiment data from 
eLabFTW JSON\n", +     "    files in a given folder, then groups the layers by sample.\n", +     "    '''\n", +     "    sample_dict = {}\n", +     "    apikey = getpass(\"Paste API key here: \")\n", +     "    for filename in valid_elabfiles(path):\n", +     "        with open(os.path.join(path, filename), \"r\") as f:\n", +     "            layer = json.load(f)\n", +     "        extra = layer[\"metadata_decoded\"][\"extra_fields\"]\n", +     "        sample_id = extra[\"Sample\"][\"value\"]\n", +     "        sample_title = id2sample(apikey, sample_id)\n", +     "        lpn = int(extra[\"Layer Progressive Number\"][\"value\"]) # Layer Progressive Number\n", +     "        if not sample_dict.get(sample_title): # if not existent yet, initialize\n", +     "            sample_dict[sample_title] = {\n", +     "                \"instrument\": {\n", +     "                    \"deposition_chamber\": extra[\"Chamber\"][\"value\"], # ID of associated resource (PLD chamber) - useless as is!\n", +     "                    \"laser_system\": extra[\"Laser System\"][\"value\"],\n", +     "                    \"rheed_system\": extra[\"RHEED System\"][\"value\"]\n", +     "                },\n", +     "                \"multilayer\": {}\n", +     "            }\n", +     "        sample_dict[sample_title][\"multilayer\"][f\"layer_{lpn}\"] = {\n", +     "            \"operator\": layer[\"fullname\"],\n", +     "            \"created_at\": layer[\"created_at\"],\n", +     "            \"sample\": extra[\"Sample\"], # ID of associated sample - useless as is!\n", +     "            \"temperature\": extra[\"Heater temperature \"], # space at the end is a config error in eLab!\n", +     "            \"target\": extra[\"Target\"]\n", +     "        }\n", +     "    return sample_dict\n", +     "\n", +     "\n", +     "sample_dict = fetch_and_group(\"../tests/objects\")\n", +     "print(json.dumps(sample_dict, indent=3))" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "de1b1870-7fc3-4ee5-8cce-c098e5bf909a", +   "metadata": {}, +   "source": [ +    "For debug purposes, let's see which info is included in the sample_dict dictionary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "0fc6e88f-881d-413a-bfe1-377213f7dda2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Info about sample Na-26-001:\n", + "* The deposition chamber is 72.\n", + "* The laser system is EXCIMER.\n", + "* The RHEED system is STAIB.\n", + "\n", + "## Layers of Na-26-001:\n", + "\n", + "### layer_1\n", + "* It was created at 2026-01-20 16:11:32.\n", + "* The operator was Emiliano Di Gennaro.\n", + "* The deposition temperature was 760 °C.\n", + "* The target eLabID was 854.\n", + "\n", + "### layer_2\n", + "* It was created at 2026-01-20 16:18:48.\n", + "* The operator was Emiliano Di Gennaro.\n", + "* The deposition temperature was 500 °C.\n", + "* The target eLabID was 466.\n" + ] + } + ], + "source": [ + "for sample in sample_dict:\n", + " print(f\"# Info about sample {sample}:\")\n", + " multilayer = sample_dict[sample][\"multilayer\"]\n", + " instrument = sample_dict[sample][\"instrument\"]\n", + " deposition_chamber = instrument[\"deposition_chamber\"] # integer\n", + " laser_system = str(instrument[\"laser_system\"]).strip().upper() # string\n", + " rheed_system = str(instrument[\"rheed_system\"]).strip().upper() # string\n", + " \n", + " print(f\"* The deposition chamber is {deposition_chamber}.\")\n", + " print(f\"* The laser system is {laser_system}.\")\n", + " print(f\"* The RHEED system is {rheed_system}.\")\n", + " print(f\"\\n## Layers of {sample}:\")\n", + " for layer in multilayer:\n", + " print(f\"\\n### {layer}\")\n", + " layerdata = multilayer[layer]\n", + " operator = layerdata[\"operator\"]\n", + " created_at = layerdata[\"created_at\"]\n", + " temperature = layerdata[\"temperature\"][\"value\"]\n", + " temperature_unit = layerdata[\"temperature\"][\"unit\"]\n", + " target = layerdata[\"target\"][\"value\"]\n", + " \n", + " print(f\"* It was created at {created_at}.\")\n", + " print(f\"* The operator was {operator}.\")\n", + " 
print(f\"* The deposition temperature was {temperature} {temperature_unit}.\")\n", + " print(f\"* The target eLabID was {target}.\")" + ] + }, + { + "cell_type": "markdown", + "id": "e16c6f3d-cc3a-45c1-9988-bbf0de3baf08", + "metadata": {}, + "source": [ + "## To the next level: creating a dictionary with the same hierarchy as the final NeXus file\n", + "\n", + "```\n", + "pld_fabrication\n", + "|-sample\n", + "| |-substrate\n", + "| | |-name\n", + "| |-multilayer\n", + "| | |-LAYER\n", + "| | | |-target\n", + "| | | | |-name\n", + "| | | | |-chemical_formula\n", + "\n", + "```" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "ba4f0459-8da0-494d-b0c2-23dae509538c", + "id": "a615c496-8cb9-451f-a088-beb4672379bf", "metadata": {}, "outputs": [], "source": [] @@ -1001,7 +1335,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.12" + "version": "3.12.3" } }, "nbformat": 4,