Preparing taxonomy mappings¶
Goal: prepare your own mapping files¶
This notebook explains the structure of mapping files and walks you through preparing your own mapping file for the risks in a given taxonomy.
Dependencies¶
Tip: Ensure you have followed the installation instructions for the risk_atlas_nexus library
git clone git@github.com:IBM/risk-atlas-nexus.git
cd risk-atlas-nexus
python -m venv vrisk-atlas-nexus
source vrisk-atlas-nexus/bin/activate
pip install -e .
import os
from rich import print
from sssom_schema import Mapping, MappingSet
from sssom.sssom_document import MappingSetDocument
from sssom.util import MappingSetDataFrame
from sssom.writers import write_table
from curies import Converter
from enum import Enum
from risk_atlas_nexus import RiskAtlasNexus
from risk_atlas_nexus.blocks.inference import (
RITSInferenceEngine,
WMLInferenceEngine,
OllamaInferenceEngine,
VLLMInferenceEngine,
)
from risk_atlas_nexus.blocks.inference.params import (
InferenceEngineCredentials,
RITSInferenceEngineParams,
WMLInferenceEngineParams,
OllamaInferenceEngineParams,
VLLMInferenceEngineParams,
)
Introduction¶
How are mappings stored in Risk Atlas Nexus?¶
To express semantically meaningful mappings between risks from different taxonomies, Risk Atlas Nexus uses the Simple Standard for Sharing Ontological Mappings (SSSOM). The mappings are maintained in SSSOM TSV files and are converted to LinkML data YAML using Python helper scripts.
Anatomy of a TSV file¶
An SSSOM/TSV file contains one mapping set object, composed of two parts:
- the metadata block, which contains essentially all the slots of a MappingSet class except the mappings slot;
- the mappings block (also called the TSV section), which contains the individual mappings.
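For illustration, a minimal SSSOM/TSV file might look like the sketch below (the identifiers and URLs are hypothetical). The metadata block is embedded as commented YAML lines above the TSV header row:

# curie_map:
#   ibm-risk-atlas: https://www.ibm.com/docs/en/watsonx/saas?topic=
#   my-prefix: https://example.org/my-taxonomy/
# mapping_set_id: https://example.org/mappings/example.tsv
# license: https://www.apache.org/licenses/LICENSE-2.0.html
subject_id	subject_label	predicate_id	object_id	object_label	mapping_justification
my-prefix:risk-1	Violent Crimes	skos:relatedMatch	ibm-risk-atlas:atlas-harmful-output	Harmful output	semapv:ManualMappingCuration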
Find out more¶
- Read about The SSSOM/TSV serialisation format
Scenario: prepare new mapping file¶
Consider a case where you would like to generate mappings for your new list of risks against risks already in the Nexus graph. This notebook shows how to prepare such a TSV file either:
- Manually
- Experimental: semi-automatically, with the aid of library functions as shown below
Note: In both cases it is strongly recommended that mappings be carefully reviewed before being used or contributed to the Risk Atlas Nexus project.
Helper functions¶
A few utility functions to generate the mapping block output are provided below.
# If you only plan to use the SEMANTIC mapping method, you may not need an
# engine at all; in that case, uncomment the next line and skip the setup below.
# inference_engine = None
inference_engine = RITSInferenceEngine(
model_name_or_path="ibm/granite-20b-code-instruct",
credentials={
"api_key": "<YOUR_API_KEY>",
"api_url": "<YOUR_API_URL>",
},
parameters=RITSInferenceEngineParams(max_tokens=1000, temperature=0.7),
)
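If you do not have access to RITS, any of the other imported engines can stand in here. A minimal sketch using Ollama in place of the RITS setup above (the model tag, URL, and parameter values are illustrative assumptions, not defaults):

# Illustrative alternative: a locally hosted model served by Ollama.
# The model tag and API URL below are placeholders; substitute your own.
inference_engine = OllamaInferenceEngine(
    model_name_or_path="granite3.2:8b",
    credentials=InferenceEngineCredentials(api_url="<YOUR_OLLAMA_URL>"),
    parameters=OllamaInferenceEngineParams(num_predict=1000, temperature=0.7),
)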
class MappingMethod(Enum):
SEMANTIC = "SEMANTIC"
INFERENCE = "INFERENCE"
def prepare_mapping_metadata(cm) -> MappingSet:
    mapping_set_metadata = MappingSet(license=cm["license"], curie_map=cm["curie_map"],
                                      mapping_set_id=cm["mapping_set_id"],
                                      mapping_set_description=cm["mapping_set_description"],
                                      mapping_date=cm["mapping_date"])
    return mapping_set_metadata
def prepare_mapping_block(new_risks, existing_risks, new_prefix, mapping_method=MappingMethod.SEMANTIC):
    ran = RiskAtlasNexus()
    mappings = ran.generate_proposed_mappings(new_risks=new_risks, existing_risks=existing_risks,
                                              inference_engine=inference_engine, new_prefix=new_prefix,
                                              mapping_method=mapping_method)
    return mappings
def combine_blocks_and_write_to_file(cm, metadata, mappings, path):
    metadata.mappings = mappings
    converter = Converter.from_prefix_map(cm["curie_map"])
    document = MappingSetDocument(mapping_set=metadata, converter=converter)
    print("\n# The mapping set document instance has been prepared.")
    msdf = MappingSetDataFrame.from_mapping_set_document(document)
    print("\n# The mapping set dataframe instance has been prepared.")
    with open(path, "w") as tmp_file:
        write_table(msdf, tmp_file)
import yaml
cm = yaml.safe_load("""
curie_map:
ibm-risk-atlas: https://www.ibm.com/docs/en/watsonx/saas?topic=
semapv: https://w3id.org/semapv/vocab/
skos: http://www.w3.org/2004/02/skos/core#
ailuminate-v1.0: https://mlcommons.org/ailuminate/
credo-ucf: https://arxiv.org/pdf/2503.05937v1/
ibm-granite-guardian: https://arxiv.org/abs/2412.07724
nist-ai-rmf: https://www.nist.gov/itl/ai-risk-management-framework/
mit-ai-risk-repository: https://airisk.mit.edu/
owasp-llm-2.0: https://owasp.org/www-project-top-10-for-large-language-model-applications/
mapping_set_id: https://github.com/IBM/risk-atlas-nexus/tree/main/src/data/mappings/ailuminate.tsv
mapping_set_description: Mapping from IBM AI Risk Atlas to AILuminate benchmark
license: https://www.apache.org/licenses/LICENSE-2.0.html
mapping_date: "2025-01-29"
""")
print(f"\n# The YAML you will use has been prepared.")
print(cm)
ms_metadata = prepare_mapping_metadata(cm)
print(f"\n# The mapping set metadata instance has been prepared.")
print(ms_metadata)
# Prepare the mapping block (manual_mb)
m1 = Mapping(predicate_id='skos:relatedMatch', mapping_justification='semapv:ManualMappingCuration',
             subject_id='new_prefix:my-risk-1-id', subject_label='Violent Crimes',
             object_id='ibm-risk-atlas:atlas-harmful-output', object_label='Harmful output',
             author_id=['my_author_email_address'], mapping_date='2025-03-31', comment='A sample mapping')
m2 = Mapping(predicate_id='rdfs:seeAlso', mapping_justification='semapv:ManualMappingCuration',
             subject_id='new_prefix:my-risk-2-id', subject_label='Nonviolent Crimes',
             object_id='ibm-risk-atlas:atlas-harmful-output', object_label='Harmful output',
             author_id=['my_author_email_address'], mapping_date='2025-03-31', comment='A sample mapping')
manual_mb = [m1, m2]
# bring it together with metadata and write to file
tmp_path = os.path.join("test_write_sssom_dataframe_manual.tsv")
combine_blocks_and_write_to_file(cm=cm, metadata=ms_metadata, mappings=manual_mb, path=tmp_path)
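To sanity-check the result, the file can be read back with sssom-py's parser (a quick verification sketch, using the same sssom package installed above):

from sssom.parsers import parse_sssom_table

# Re-read the TSV we just wrote and inspect the parsed mappings.
check_msdf = parse_sssom_table(tmp_path)
print(check_msdf.df.head())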
Automatic creation¶
Alternatively, you may prepare the list of mappings semi-automatically, populating the mapping block with library methods that take as input the two lists of risks to be mapped to each other.
Two methods are available to propose mappings:
- Semantic (queries an embedding of available risks)
- Inference (LLM query to find if risks might be related)
# Set up Risk Atlas Nexus with all risks or the subset of risks you want to map to.
# In this case, specify IBM AI Risk Atlas only
ran = RiskAtlasNexus()
all_risks = ran.get_all_risks(taxonomy="ibm-risk-atlas")
print(f"\n# The taxonomy ibm-risk-atlas has {len(all_risks)} risks you can map to.") # 67
print(all_risks[:2])
# Set up a second list: the risks you are proposing to map
# If your risks exist already in the Atlas, just do:
my_taxonomy_id = "ailuminate-v1.0" # for example
risks_to_map = ran.get_all_risks(taxonomy=my_taxonomy_id)
# else if they do not yet exist, use the lines below
'''
# Create an instance which extends the graph with your custom definitions
my_base_dir='<my_user_input_dir_path>' # path where your custom yaml is
my_extended_ran = RiskAtlasNexus(base_dir=my_base_dir)
risks_to_map = my_extended_ran.get_all_risks(taxonomy="<my-taxonomy-id>")
'''
print(f"\n# The taxonomy <my-taxonomy-id> has {len(risks_to_map)} risks you can map to.")
print(risks_to_map[:2])
auto_semantic_mb = prepare_mapping_block(risks_to_map, all_risks, "new_prefix", MappingMethod.SEMANTIC)
print("\n# The mapping set block instance has been prepared.")
# bring it together with metadata and write to file
tmp_path = os.path.join("test_write_sssom_dataframe_automatic_semantic.tsv")
combine_blocks_and_write_to_file(cm=cm, metadata=ms_metadata, mappings=auto_semantic_mb, path=tmp_path)
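The INFERENCE method can be exercised the same way: it uses the configured inference_engine to judge whether risks might be related, so it is slower and requires the engine set up earlier. A sketch (the output file name is arbitrary):

# Sketch: propose mappings via LLM inference rather than embedding similarity.
# Requires a configured inference_engine; the output still needs human review.
auto_inference_mb = prepare_mapping_block(risks_to_map, all_risks, "new_prefix", MappingMethod.INFERENCE)
tmp_path = os.path.join("test_write_sssom_dataframe_automatic_inference.tsv")
combine_blocks_and_write_to_file(cm=cm, metadata=ms_metadata, mappings=auto_inference_mb, path=tmp_path)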
Use case: get proposed risk mappings across the taxonomies¶
The following code is a starter example of how you might script the production of suggested mappings across the taxonomies and write them out to TSV files.
ran = RiskAtlasNexus()

new_t_ids = [
    "my-new-taxonomy-1",
    "my-new-taxonomy-2",
]

# If there were errors or service interruptions before all taxonomies completed,
# add the completed taxonomies here so they are not mapped again.
completed = []

for t_id in new_t_ids:
    if t_id in completed:
        continue
    mappings = []
    risks_to_map = ran.get_all_risks(taxonomy=t_id)
    print(f"\n# Proposing matches for the {len(risks_to_map)} risks in taxonomy {t_id}...")
    for taxonomy in ran.get_all_taxonomies():
        if (taxonomy.id != t_id) and (taxonomy.id != "ai-risk-taxonomy"):
            t_risks = ran.get_all_risks(taxonomy=taxonomy.id)
            print(f"\n# Processing {len(t_risks)} risks from taxonomy {taxonomy.id}.")
            # new_prefix corresponds to the taxonomy of the risks being mapped (risks_to_map)
            auto_semantic_mb = prepare_mapping_block(risks_to_map, t_risks, t_id, MappingMethod.SEMANTIC)
            auto_inference_mb = prepare_mapping_block(risks_to_map, t_risks, t_id, MappingMethod.INFERENCE)
            print(f"\n# Appending semantic: {len(auto_semantic_mb)}, inference: {len(auto_inference_mb)}.")
            mappings = mappings + auto_semantic_mb + auto_inference_mb
    # Bring it together with metadata and write to file
    tmp_path = os.path.join("test_write_sssom_auto_" + t_id + ".tsv")
    print(f"\n# Writing {len(mappings)} mappings to {tmp_path}.")
    combine_blocks_and_write_to_file(cm=cm, metadata=ms_metadata, mappings=mappings, path=tmp_path)
Next steps¶
- Verify the mapping files (human review is essential)
- Lift them to LinkML data YAML (using the commands in the Makefile)
- Save them in the knowledge graph data mapping folder