Skip to content

To Label Studio

Convert Fonduer candidates to Label Studio annotations.

ToLabelStudio

Transfer Fonduer candidates to Label Studio labels.

Source code in LabelstudioToFonduer/to_label_studio.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
class ToLabelStudio:
    """Transfer Fonduer candidates to Label Studio labels."""

    def get_offsets(self, span: str, sentence: str) -> List[Tuple[int, int]]:
        """Get all occurrences of a span in a sentence and return their offsets.

        Args:
            span (str): Span to be matched.
            sentence (str): Sentence to be searched.

        Returns:
            List[Tuple[int, int]]: List of (start, end) character offsets.
        """
        span_escaped = re.escape(span)  # the span is literal text, not a regex pattern
        result = re.finditer(span_escaped, sentence)
        offsets = []
        for match in result:
            offsets.append((match.start(), match.end()))
        return offsets

    def seriealize_relation(self, relation: Any, confidence: float = 0.00) -> List[Dict[str, Any]]:
        """Serialize a Fonduer candidate relation into a Label Studio annotated relation of entities.

        The Fonduer candidate is parsed and modified to create a Label Studio annotation from it.
        First the relation is created and then all entities are added. The offset is modified to
        match the HTML element span; the XPath is modified to use the body element as root.

        Args:
            relation (Any): Fonduer candidate relation.
            confidence (float): Confidence score of the relation. Defaults to 0.00.

        Returns:
            List[Dict[str, Any]]: List of serialized relations and entities.
        """

        def calculate_offset_plus(
            fd_span: str, html_string: str, xpath: str, offset_start: int
        ) -> int:
            """Calculate the additional offset from the HTML element.

            Fonduer calculates the offset based on the sentence only. The HTML element located
            by the XPath may contain further sentences, and the HTML tag may also contain
            trailing whitespace, so an additional offset may be needed.
            This offset is calculated by searching for the Fonduer span in the HTML element
            and subtracting the offset reported by Fonduer.

            Args:
                fd_span (str): Text of the entity from Fonduer.
                html_string (str): Full HTML string to construct the DOM and search the XPath.
                xpath (str): XPath to the HTML element containing the span.
                offset_start (int): Fonduer offset.

            Returns:
                int: Additional character offset.

            Raises:
                ValueError: If the span cannot be found in the element's text.
            """
            dom = lxml.etree.ElementTree(lxml.html.fromstring(html_string))  # Create the dom
            results = dom.xpath(xpath)

            if len(results) > 1:
                logger.warning(
                    "More than one element found for XPath: '%s'. Using first element.", xpath
                )

            if results:
                html_span = results[0].text_content()
                # If the string occurs multiple times in the context, the occurrence with the
                # closest offset to the offset from Fonduer is used.
                matches = self.get_offsets(fd_span, html_span)

                if len(matches) > 1:
                    candidates = []
                    for candidate in matches:
                        candidates.append(abs(candidate[0] - offset_start))
                    index = candidates.index(min(candidates))
                    offset_plus = matches[index][0]
                elif len(matches) == 1:
                    offset_plus = matches[0][0]

                elif len(matches) == 0:
                    raise ValueError("Span not found in sentence")

                # NOTE(review): a match at element position 0 yields offset_plus == 0 and is
                # treated as "no extra offset"; original behavior kept deliberately.
                if offset_plus < 1:
                    logger.warning("Offset is smaller than 1")
                    return 0
                return offset_plus - offset_start

            else:
                logger.warning("No span found from XPath")
                return 0

        # Full HTML text of the source document.
        html_document = relation.document.text
        results_section = []

        # Add the relation first; the entity results below are referenced by their ids.
        results_section.append(
            {
                "from_id": relation[0].id,
                "to_id": relation[1].id,
                "type": "relation",
                "direction": "right",
                "readonly": False,
            }
        )

        # Add the entities.
        for entity in relation:
            span_mention = entity[0]

            xpath_start = span_mention.sentence.xpath
            xpath_end = span_mention.sentence.xpath

            text = span_mention.get_span()

            offset_plus = calculate_offset_plus(
                text,
                html_document,
                xpath_start,
                span_mention.char_start,
            )

            offset_start = span_mention.char_start + offset_plus
            # +1: Fonduer's char_end is inclusive, Label Studio's endOffset is exclusive.
            offset_end = span_mention.char_end + offset_plus + 1

            # Logging
            dom = lxml.etree.ElementTree(lxml.html.fromstring(html_document))
            results = dom.xpath(xpath_start)
            name = relation.document.name

            if results:
                html_span = results[0].text_content()  # type: ignore

                logger.info(f"Doc: '{name}' XPath: '{xpath_start}'")
                logger.info(
                    f"Doc: '{name}' Offset_start: '{offset_start-offset_plus}' Offset_end: '{offset_end-offset_plus}'"
                )
                logger.info(f"Doc: '{name}' Plus: '{offset_plus}'")
                # Bug fix: log the entity span text, not the document name a second time.
                logger.info(f"Doc: '{name}' Span: '{text}'")
                logger.info(f"Doc: '{name}' Raw: '{repr(html_span)}'")
                logger.info(
                    f"Doc: '{name}' Marked: '{highlight_span(html_span, offset_start, offset_end)}'"
                )

            else:
                logger.warning(f"Doc: '{name}' No span found from XPath")

            result = {
                "id": entity.id,
                "from_name": "ner",
                "to_name": "text",
                "type": "hypertextlabels",
                "readonly": False,
                "hidden": False,
                "score": confidence,
                "value": {
                    # Label Studio XPaths are rooted at the <body> element.
                    "start": xpath_start.replace("/html/body", ""),
                    "end": xpath_end.replace("/html/body", ""),
                    "startOffset": offset_start,
                    "endOffset": offset_end,
                    "text": text,
                    "hypertextlabels": [entity.type[0].upper() + entity.type[1:]],
                },
            }
            results_section.append(result)

        return results_section

    def create_export(
        self, candidates: List[Any], fonduer_export_path: str = ""
    ) -> Union[str, List[Any]]:
        """Create a Label Studio import file from Fonduer candidates.

        Args:
            candidates (List[Any]): Candidates from Fonduer.
            fonduer_export_path (str, optional): Desired location for the Label Studio import file. Defaults to "".

        Returns:
            Union[str, List[Any]]: Import dictionary or the
            path where the import file is saved.
        """
        documents: Dict[str, Any] = {}

        for relation in candidates[0]:  # serialize all candidates
            if relation.document.name not in documents:  # Group by documents
                # Create base document
                documents[relation.document.name] = {
                    "id": relation.document.name,
                    "data": {
                        "text": relation.document.text,
                    },
                    "annotations": [{"model_version": 0, "score": 0, "result": []}],
                }

            # Add relations of document
            documents[relation.document.name]["annotations"][0]["result"].extend(
                self.seriealize_relation(relation)
            )

        fonduer_export = list(documents.values())

        if fonduer_export_path:
            with open(fonduer_export_path, "w") as file:
                json.dump(fonduer_export, file)
            return fonduer_export_path
        else:
            return fonduer_export

get_offsets(span, sentence)

Get all occurrences of a span in a sentence and return the offsets

Parameters:

Name Type Description Default
span str

Span to be matched.

required
sentence str

Sentence to be searched.

required

Returns:

Type Description
List[Tuple[int, int]]

List[Tuple[int, int]]: List of offsets.

Source code in LabelstudioToFonduer/to_label_studio.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def get_offsets(self, span: str, sentence: str) -> List[Tuple[int, int]]:
    """Find every occurrence of *span* inside *sentence*.

    Args:
        span (str): Span to be matched.
        sentence (str): Sentence to be searched.

    Returns:
        List[Tuple[int, int]]: (start, end) character offsets of each match.
    """
    # Escape the span so it is matched as literal text, not as a regex pattern.
    pattern = re.escape(span)
    return [(hit.start(), hit.end()) for hit in re.finditer(pattern, sentence)]

seriealize_relation(relation, confidence=0.0)

Serialize a Fonduer candidate relation into a Label Studio annotated relation of entities.

The Fonduer candidate is parsed and modified to create a Label Studio annotation from it. First the relation is created and then all entities are added. The offset is modified to match the HTML element span; the XPath is modified to use the body element as root.

Parameters:

Name Type Description Default
relation Any

Fonduer candidate relation.

required
confidence Float

confidence score of the relation. Defaults to 0.00.

0.0

Returns:

Type Description
List[Dict[str, Any]]

List[Dict[str, Any]]: List of serialized relations and entities.

Source code in LabelstudioToFonduer/to_label_studio.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def seriealize_relation(self, relation: Any, confidence: float = 0.00) -> List[Dict[str, Any]]:
    """Serialize a Fonduer candidate relation into a Label Studio annotated relation of entities.

    The Fonduer candidate is parsed and modified to create a Label Studio annotation from it.
    First the relation is created and then all entities are added. The offset is modified to
    match the HTML element span; the XPath is modified to use the body element as root.

    Args:
        relation (Any): Fonduer candidate relation.
        confidence (float): Confidence score of the relation. Defaults to 0.00.

    Returns:
        List[Dict[str, Any]]: List of serialized relations and entities.
    """

    def calculate_offset_plus(
        fd_span: str, html_string: str, xpath: str, offset_start: int
    ) -> int:
        """Calculate the additional offset from the HTML element.

        Fonduer calculates the offset based on the sentence only. The HTML element located
        by the XPath may contain further sentences, and the HTML tag may also contain
        trailing whitespace, so an additional offset may be needed.
        This offset is calculated by searching for the Fonduer span in the HTML element
        and subtracting the offset reported by Fonduer.

        Args:
            fd_span (str): Text of the entity from Fonduer.
            html_string (str): Full HTML string to construct the DOM and search the XPath.
            xpath (str): XPath to the HTML element containing the span.
            offset_start (int): Fonduer offset.

        Returns:
            int: Additional character offset.

        Raises:
            ValueError: If the span cannot be found in the element's text.
        """
        dom = lxml.etree.ElementTree(lxml.html.fromstring(html_string))  # Create the dom
        results = dom.xpath(xpath)

        if len(results) > 1:
            logger.warning(
                "More than one element found for XPath: '%s'. Using first element.", xpath
            )

        if results:
            html_span = results[0].text_content()
            # If the string occurs multiple times in the context, the occurrence with the
            # closest offset to the offset from Fonduer is used.
            matches = self.get_offsets(fd_span, html_span)

            if len(matches) > 1:
                candidates = []
                for candidate in matches:
                    candidates.append(abs(candidate[0] - offset_start))
                index = candidates.index(min(candidates))
                offset_plus = matches[index][0]
            elif len(matches) == 1:
                offset_plus = matches[0][0]

            elif len(matches) == 0:
                raise ValueError("Span not found in sentence")

            # NOTE(review): a match at element position 0 yields offset_plus == 0 and is
            # treated as "no extra offset"; original behavior kept deliberately.
            if offset_plus < 1:
                logger.warning("Offset is smaller than 1")
                return 0
            return offset_plus - offset_start

        else:
            logger.warning("No span found from XPath")
            return 0

    # Full HTML text of the source document.
    html_document = relation.document.text
    results_section = []

    # Add the relation first; the entity results below are referenced by their ids.
    results_section.append(
        {
            "from_id": relation[0].id,
            "to_id": relation[1].id,
            "type": "relation",
            "direction": "right",
            "readonly": False,
        }
    )

    # Add the entities.
    for entity in relation:
        span_mention = entity[0]

        xpath_start = span_mention.sentence.xpath
        xpath_end = span_mention.sentence.xpath

        text = span_mention.get_span()

        offset_plus = calculate_offset_plus(
            text,
            html_document,
            xpath_start,
            span_mention.char_start,
        )

        offset_start = span_mention.char_start + offset_plus
        # +1: Fonduer's char_end is inclusive, Label Studio's endOffset is exclusive.
        offset_end = span_mention.char_end + offset_plus + 1

        # Logging
        dom = lxml.etree.ElementTree(lxml.html.fromstring(html_document))
        results = dom.xpath(xpath_start)
        name = relation.document.name

        if results:
            html_span = results[0].text_content()  # type: ignore

            logger.info(f"Doc: '{name}' XPath: '{xpath_start}'")
            logger.info(
                f"Doc: '{name}' Offset_start: '{offset_start-offset_plus}' Offset_end: '{offset_end-offset_plus}'"
            )
            logger.info(f"Doc: '{name}' Plus: '{offset_plus}'")
            # Bug fix: log the entity span text, not the document name a second time.
            logger.info(f"Doc: '{name}' Span: '{text}'")
            logger.info(f"Doc: '{name}' Raw: '{repr(html_span)}'")
            logger.info(
                f"Doc: '{name}' Marked: '{highlight_span(html_span, offset_start, offset_end)}'"
            )

        else:
            logger.warning(f"Doc: '{name}' No span found from XPath")

        result = {
            "id": entity.id,
            "from_name": "ner",
            "to_name": "text",
            "type": "hypertextlabels",
            "readonly": False,
            "hidden": False,
            "score": confidence,
            "value": {
                # Label Studio XPaths are rooted at the <body> element.
                "start": xpath_start.replace("/html/body", ""),
                "end": xpath_end.replace("/html/body", ""),
                "startOffset": offset_start,
                "endOffset": offset_end,
                "text": text,
                "hypertextlabels": [entity.type[0].upper() + entity.type[1:]],
            },
        }
        results_section.append(result)

    return results_section

create_export(candidates, fonduer_export_path='')

Create a Label Studio import file from Fonduer candidates.

Parameters:

Name Type Description Default
candidates List[Any]

Candidates from Fonduer.

required
fonduer_export_path str

Desired location for the Label Studio import file. Defaults to "".

''

Returns:

Type Description
Union[str, List[Any]]

Union[str, List[Any]]: Import dictionary or the

Union[str, List[Any]]

path where the import file is saved.

Source code in LabelstudioToFonduer/to_label_studio.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def create_export(
    self, candidates: List[Any], fonduer_export_path: str = ""
) -> Union[str, List[Any]]:
    """Create a Label Studio import file from Fonduer candidates.

    Args:
        candidates (List[Any]): Candidates from Fonduer.
        fonduer_export_path (str, optional): Desired location for the Label Studio import file. Defaults to "".

    Returns:
        Union[str, List[Any]]: Import dictionary or the
        path where the import file is saved.
    """
    documents: Dict[str, Any] = {}

    # Serialize every candidate, grouping the results by source document.
    for relation in candidates[0]:
        doc_name = relation.document.name
        if doc_name not in documents:
            # First candidate for this document: create the base document entry.
            documents[doc_name] = {
                "id": doc_name,
                "data": {
                    "text": relation.document.text,
                },
                "annotations": [{"model_version": 0, "score": 0, "result": []}],
            }

        # Append this relation's serialized annotations to the document.
        serialized = self.seriealize_relation(relation)
        documents[doc_name]["annotations"][0]["result"].extend(serialized)

    fonduer_export = list(documents.values())

    # Without a path, hand the structure back directly; otherwise persist as JSON.
    if not fonduer_export_path:
        return fonduer_export
    with open(fonduer_export_path, "w") as file:
        json.dump(fonduer_export, file)
    return fonduer_export_path