Skip to content

To Fonduer

Convert Label Studio annotations to Fonduer Annotations.

LabelStudioExport

Representation of a Label Studio export.

This object contains all information from a Label Studio export that is needed to transfer the labeled entities to the Fonduer data model. Additionally, supplementary methods are available to determine the set of labels and their n-gram sizes.

Parameters:

Name Type Description Default
documents List[LabelStudioDocument]

Parsed documents that are labeled in the Label Studio export.

required
file_path str

Path to the export.json file from Label Studio.

required
Source code in LabelstudioToFonduer/to_fonduer.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
class LabelStudioExport:
    """Representation of a Label Studio export.

    This object holds the parsed documents of a Label Studio export together with the path the
    export was read from. Helper methods report the set of assigned labels, the n-gram size
    range of a label and the entity texts of a label.

    Args:
        documents (List[LabelStudioDocument]): Parsed documents that are labeled in the Label
            Studio export.
        file_path (str): Path to the `export.json` file from Label Studio.
    """

    def __init__(self, documents: List["LabelStudioDocument"], file_path: str) -> None:
        self.documents = documents
        self.file_path = file_path

    def label(self) -> Set[str]:
        """Get a set of all labels that were assigned in Label Studio.

        Returns:
            Set[str]: Set of all distinct labels in the export.
        """
        return {entity.label for document in self.documents for entity in document.entities}

    def ngrams(self, label: str) -> Tuple[int, int]:
        """Determine the n-gram size of a label.

        The text of every entity with the given label, across all documents, is split on
        whitespace. The token counts of the shortest and the longest entity are reported as
        n-gram size.

        ???+ warning "Extra padding needed"

            Fonduer might split sentences differently. Therefore, special characters not
            labeled might also count as tokens. Therefore, a slightly longer ngram size might be
            needed to account for these tokens.

        Args:
            label (str): The label the n-gram size should be determined for.

        Returns:
            Tuple[int, int]: A tuple of the minimal and maximal n-gram size.

        Raises:
            ValueError: If no entity in the export carries the given label.
        """
        # `split()` without a separator splits on any run of whitespace, so repeated spaces,
        # tabs and newlines inside an entity do not distort the token count.
        lengths = [
            len(entity.text.split())
            for document in self.documents
            for entity in document.entities
            if entity.label == label
        ]
        if not lengths:
            raise ValueError(f"No entity with label '{label}' found in the export.")
        return (min(lengths), max(lengths))

    def lable_entitis(self, label: str) -> Set[str]:
        """Get a set of all entity texts for a given label.

        The texts of all entities with the given label are collected from all documents.

        Args:
            label (str): The label the entity texts should be collected for.

        Returns:
            Set[str]: Set of all entity texts for that label; empty if the label is unused.
        """
        return {
            entity.text
            for document in self.documents
            for entity in document.entities
            if entity.label == label
        }

    def __str__(self) -> str:
        return f"Label-Studio Export from '{self.file_path}' with {len(self.documents)} documents."

    def __repr__(self) -> str:
        return f"<Export filename: {self.file_path}, num_documents: {len(self.documents)}>"

label()

Get a set of all labels that were assigned in Label Studio.

Returns:

Type Description
Set[str]

Set[str]: Set of all distinct labels in the export.

Source code in LabelstudioToFonduer/to_fonduer.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def label(self) -> Set[str]:
    """Collect every label that was assigned in Label Studio.

    Returns:
        Set[str]: The distinct labels found on the entities of all documents.
    """
    return {
        annotation.label
        for doc in self.documents
        for annotation in doc.entities
    }

ngrams(label)

Determine the n-gram size of a label.

All entities in all documents are split on whitespaces. The length of the shortest and longest entity is reported as ngram size.

Extra padding needed

Fonduer might split sentences differently. Therefore, special characters not labeled might also count as tokens. Therefore, a slightly longer ngram size might be needed to account for these tokens.

Parameters:

Name Type Description Default
label str

The label the ngram size should be determined for.

required

Returns:

Type Description
Tuple[int, int]

Tuple[int, int]: A tuple of the minimal and maximal ngram size.

Source code in LabelstudioToFonduer/to_fonduer.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def ngrams(self, label: str) -> Tuple[int, int]:
    """Determine the n-gram size of a label.

    The text of every entity with the given label, across all documents, is split on
    whitespace. The token counts of the shortest and the longest entity are reported as
    n-gram size.

    ???+ warning "Extra padding needed"

        Fonduer might split sentences differently. Therefore, special characters not
        labeled might also count as tokens. Therefore, a slightly longer ngram size might be
        needed to account for these tokens.

    Args:
        label (str): The label the n-gram size should be determined for.

    Returns:
        Tuple[int, int]: A tuple of the minimal and maximal n-gram size.

    Raises:
        ValueError: If no entity in the export carries the given label.
    """
    # `split()` without a separator splits on any run of whitespace, so repeated spaces,
    # tabs and newlines inside an entity do not distort the token count.
    lengths = [
        len(entity.text.split())
        for document in self.documents
        for entity in document.entities
        if entity.label == label
    ]
    if not lengths:
        raise ValueError(f"No entity with label '{label}' found in the export.")
    return (min(lengths), max(lengths))

lable_entitis(label)

Get a set of all entity texts for a given label.

A set of entity texts from all entities with the given label is created from all documents.

Parameters:

Name Type Description Default
label str

The label the set of entity texts should be created for.

required

Returns:

Type Description
Set[str]

Set[str]: Set of all entity texts for that label.

Source code in LabelstudioToFonduer/to_fonduer.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def lable_entitis(self, label: str) -> Set[str]:
    """Collect the texts of all entities that carry a given label.

    Every document is scanned and the text of each entity with a matching label is gathered
    into a set.

    Args:
        label (str): The label whose entity texts should be collected.

    Returns:
        Set[str]: The distinct entity texts for that label.
    """
    return {
        annotation.text
        for doc in self.documents
        for annotation in doc.entities
        if annotation.label == label
    }

parse_export(label_studio_export_path)

Parse a Label Studio export JSON file into an Export object. The parser extracts documents and entities and the relevant features to match label studio annotations with fonduer annotations.

Parameters:

Name Type Description Default
label_studio_export_path str

Path to the export.json file exported from label studio.

required

Returns:

Name Type Description
LabelStudioExport LabelStudioExport

Export object with all necessary information at hand.

Source code in LabelstudioToFonduer/to_fonduer.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def parse_export(label_studio_export_path: str) -> LabelStudioExport:
    """Parse a Label Studio export JSON file into a `LabelStudioExport` object.

    For every task in the export, the HTML string, the file name and the labeled entities
    (offsets, text, label and XPath) are extracted so that the Label Studio annotations can be
    matched with Fonduer annotations. Only the first annotation of each task is used. A task
    without any annotation results in a document without entities.

    Args:
        label_studio_export_path (str): Path to the export.json file exported from label studio.

    Returns:
        LabelStudioExport: Export object with all necessary information at hand.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's locale encoding.
    with open(label_studio_export_path, "r", encoding="utf-8") as file:
        export = json.load(file)

    documents = []
    for task in export:
        # Tasks that were never annotated carry no (or an empty) annotation list.
        annotations = task.get("annotations") or []
        entity_list = annotations[0]["result"] if annotations else []

        # get html key, may be different in the label-studio annotation view
        html_key = list(task["data"])[0]
        html_string = task["data"][html_key]

        # get filename: everything after the first "-" of the uploaded file name
        # (presumably this drops a prefix added by Label Studio on upload -- TODO confirm)
        filename = "-".join(task["file_upload"].split("-")[1:])

        entities = []
        for entity in entity_list:
            value = entity.get("value")
            if not value:
                # results without a value carry no labeled span
                continue

            raw_text = value["text"]
            text = raw_text.strip()
            label = value["hypertextlabels"][0]

            # The stored text has all surrounding whitespace stripped, so shift the offsets by
            # exactly the number of characters removed on each side. For whitespace-only text
            # everything counts as leading, which keeps start <= end within a single node.
            leading = len(raw_text) - len(raw_text.lstrip())
            trailing = len(raw_text) - leading - len(text)
            start_offset = value["startOffset"] + leading
            end_offset = value["endOffset"] - trailing

            # XPath
            xpath = value["start"]

            entities.append(
                LabelStudioEntity(start_offset, end_offset, text, label, xpath, filename)
            )

        document = LabelStudioDocument(filename=filename, entities=entities, html=html_string)
        documents.append(document)
    return LabelStudioExport(documents=documents, file_path=label_studio_export_path)