Skip to content

Teanga Document Module

CharacterLayer

Bases: str, Layer

Source code in teanga/document.py
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
class CharacterLayer(str, Layer):
    """A layer holding raw character data; it is both a `str` and a `Layer`."""

    @property
    def data(self):
        """
        Return the data values of the layer.

        A character layer carries no data, so the result is one `None`
        per character.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters")})
            >>> doc["text"] = "This"
            >>> doc["text"].data
            [None, None, None, None]
        """
        return [None] * len(self)

    @property
    def raw(self):
        """Return the content of the layer as a plain `str`."""
        return str(self)

    @property
    def text(self):
        """
        Return the underlying text grouped by the annotations of this layer.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters")})
            >>> doc["text"] = "This is a document."
            >>> doc["text"].text
            ['This is a document.']
        """
        return [str(self)]

    def indexes(self, layer:str):
        """
        Return the indexes of the annotations of this layer.

        The `layer` argument is not used: every character is reported as
        the span `(i, i + 1)`.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters")})
            >>> doc["text"] = "This"
            >>> doc["text"].indexes("text")
            [(0, 1), (1, 2), (2, 3), (3, 4)]
        """
        return list(zip(range(len(self)), range(1, len(self) + 1)))

    def matches(self, value: Union[str,list,dict]) -> Iterator[int]:
        """Return the indexes of the annotations that match the given value.

        The whole string is treated as a single annotation, so the result
        is either `[0]` (match) or `[]` (no match).

        Parameters:
            value: Union[str,list,dict]
                The value to match as described in the `view` method of
                the `Corpus` class.

        Raises:
            Exception: If `value` is not a `str`, `list` or `dict`.
        """
        if isinstance(value, str):
            if value == self:
                return [0]
            else:
                return []
        elif isinstance(value, list):
            if any(v == self for v in value):
                return [0]
            else:
                return []
        elif isinstance(value, dict):
            if any(k.startswith("$text") for k in value):
                if all(_key_match(None, str(self), k, v) for k, v in value.items()):
                    return [0]
            # Either a "$text" condition failed or the query has no "$text"
            # condition at all.  The latter case used to fall off the end of
            # the method and return None, which callers cannot iterate over.
            return []
        else:
            raise Exception("Bad value: " + repr(value))

    def transform(self, transform_func):# -> Self:
        """Return a new `CharacterLayer` built from `transform_func(str(self))`."""
        return CharacterLayer(transform_func(str(self)))

data property

Return the data values of the layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters")})
>>> doc["text"] = "This"
>>> doc["text"].data
[None, None, None, None]

text property

Return the underlying text grouped by the annotations of this layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters")})
>>> doc["text"] = "This is a document."
>>> doc["text"].text
['This is a document.']

indexes(layer)

Return the indexes of the annotations of this layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters")})
>>> doc["text"] = "This"
>>> doc["text"].indexes("text")
[(0, 1), (1, 2), (2, 3), (3, 4)]
Source code in teanga/document.py
650
651
652
653
654
655
656
657
658
659
660
def indexes(self, layer:str):
    """
    Return one `(start, end)` pair per character of this layer.

    The `layer` argument is accepted for interface compatibility but is
    not consulted.

    Examples:
        >>> doc = Document({"text": LayerDesc(layer_type="characters")})
        >>> doc["text"] = "This"
        >>> doc["text"].indexes("text")
        [(0, 1), (1, 2), (2, 3), (3, 4)]
    """
    return [(position, position + 1) for position in range(len(self))]

matches(value)

Return the indexes of the annotations that match the given value.

Parameters:

Name Type Description Default
value Union[str, list, dict]

Union[str,list,dict] The value to match as described in the view method of the Corpus class.

required
Source code in teanga/document.py
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
def matches(self, value: Union[str,list,dict]) -> Iterator[int]:
    """Return the indexes of the annotations that match the given value.

    The whole string is treated as a single annotation, so the result
    is either `[0]` (match) or `[]` (no match).

    Parameters:
        value: Union[str,list,dict]
            The value to match as described in the `view` method of
            the `Corpus` class.

    Raises:
        Exception: If `value` is not a `str`, `list` or `dict`.
    """
    if isinstance(value, str):
        if value == self:
            return [0]
        else:
            return []
    elif isinstance(value, list):
        if any(v == self for v in value):
            return [0]
        else:
            return []
    elif isinstance(value, dict):
        if any(k.startswith("$text") for k in value):
            if all(_key_match(None, str(self), k, v) for k, v in value.items()):
                return [0]
        # Either a "$text" condition failed or the query has no "$text"
        # condition at all.  The latter case used to fall off the end of
        # the function and return None, which callers cannot iterate over.
        return []
    else:
        raise Exception("Bad value: " + repr(value))

DataLayer

Bases: Layer

Any non-character layer of annotation

Source code in teanga/document.py
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
class DataLayer(Layer):
    """Any non-character layer of annotation"""

    def __init__(self, name:str, doc:Document):
        # The layer keeps its own name, the description registered for that
        # name in the document's meta, and a reference to the document.
        self._name = name
        self._meta = doc.meta[name]
        self._doc = doc

    def root_layer(self) -> str:
        """Walk the chain of base layers and name the layer at its root.

        Returns:
            str: This layer's own name when it has no base, the base's name
                when the base is a `CharacterLayer`, otherwise whatever the
                base layer reports as its root.
        """
        base_name = self._meta.base
        if base_name is None:
            return self._name
        parent = self._doc.layers[base_name]
        if isinstance(parent, CharacterLayer):
            return base_name
        return parent.root_layer()

    def matches(self, value: Union[str,list,dict]) -> Iterator[int]:
        """Yield the positions of the annotations that satisfy `value`.

        Strings and lists are compared against the layer's text when the
        layer has no data type, and against its data otherwise.  Dicts are
        evaluated condition by condition with `_key_match`.

        Parameters:
            value: Union[str,list,dict]
                The value to match as described in the `view` method of
                the `Corpus` class.

        Raises:
            Exception: If `value` is not a `str`, `list` or `dict`.
        """
        if isinstance(value, (str, list)):
            candidates = self.text if self._meta.data is None else self.data
            if isinstance(value, str):
                return (pos for pos, item in enumerate(candidates)
                        if item == value)
            return (pos for pos, item in enumerate(candidates)
                    if item in value)

        if isinstance(value, dict):
            def satisfied(datum, text):
                return all(_key_match(datum, text, k, v)
                           for k, v in value.items())

            if any(k.startswith("$text") for k in value):
                # Text is only paired with the data when a condition needs it.
                return (pos
                        for pos, (datum, text)
                        in enumerate(zip(self.data, self.text))
                        if satisfied(datum, text))
            return (pos for pos, datum in enumerate(self.data)
                    if satisfied(datum, None))

        raise Exception("Bad value: " + repr(value))

    def __len__(self):
        # `_data` is not assigned in this class's __init__; presumably
        # subclasses provide it.
        return len(self._data)

    def __getitem__(self, key):
        """Look up `key` in the raw annotations of the layer."""
        annotations = self.raw
        return annotations[key]

    def __iter__(self):
        """Iterate over the raw annotations of the layer."""
        annotations = self.raw
        return iter(annotations)

    def __contains__(self, item):
        """Tell whether `item` is one of the raw annotations."""
        annotations = self.raw
        return item in annotations

    def __repr__(self):
        """Render the class name, layer name, document id and raw data."""
        return "{}({}, {}, {})".format(
            self.__class__.__name__, self._name, self._doc.id, self.raw)

    def __eq__(self, other):
        """Compare with a list (by raw data) or a `DataLayer` (name and data)."""
        if isinstance(other, list):
            return self.raw == other
        if isinstance(other, DataLayer):
            return (self._name == other._name and
                    self.raw == other.raw)
        return False

__contains__(item)

Return whether the item is in the layer.

Source code in teanga/document.py
602
603
604
def __contains__(self, item):
    """Tell whether `item` is one of the raw annotations of the layer."""
    annotations = self.raw
    return item in annotations

__eq__(other)

Return whether the layer is equal to another layer.

Source code in teanga/document.py
610
611
612
613
614
615
616
617
def __eq__(self, other):
    """Compare with a list (by raw data) or a `DataLayer` (name and data).

    Anything that is neither a list nor a `DataLayer` compares unequal.
    """
    if isinstance(other, list):
        return self.raw == other
    if isinstance(other, DataLayer):
        return (self._name == other._name and
                self.raw == other.raw)
    return False

__getitem__(key)

Return the annotation with the given index.

Source code in teanga/document.py
594
595
596
def __getitem__(self, key):
    """Look up `key` (an index or slice) in the raw annotations of the layer."""
    annotations = self.raw
    return annotations[key]

__iter__()

Return an iterator over the annotations of the layer.

Source code in teanga/document.py
598
599
600
def __iter__(self):
    """Iterate over the raw annotations of the layer."""
    annotations = self.raw
    return iter(annotations)

__repr__()

Return a string representation of the layer.

Source code in teanga/document.py
606
607
608
def __repr__(self):
    """Render the class name, layer name, document id and raw data."""
    return "{}({}, {}, {})".format(
        self.__class__.__name__, self._name, self._doc.id, self.raw)

matches(value)

Return the indexes of the annotations that match the given value.

Parameters:

Name Type Description Default
value Union[str, list, dict]

Union[str,list,dict] The value to match as described in the view method of the Corpus class.

required
Source code in teanga/document.py
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
def matches(self, value: Union[str,list,dict]) -> Iterator[int]:
    """Yield the positions of the annotations that satisfy `value`.

    Strings and lists are compared against the layer's text when the
    layer has no data type, and against its data otherwise.  Dicts are
    evaluated condition by condition with `_key_match`.

    Parameters:
        value: Union[str,list,dict]
            The value to match as described in the `view` method of
            the `Corpus` class.

    Raises:
        Exception: If `value` is not a `str`, `list` or `dict`.
    """
    if isinstance(value, (str, list)):
        candidates = self.text if self._meta.data is None else self.data
        if isinstance(value, str):
            return (pos for pos, item in enumerate(candidates)
                    if item == value)
        return (pos for pos, item in enumerate(candidates)
                if item in value)

    if isinstance(value, dict):
        def satisfied(datum, text):
            return all(_key_match(datum, text, k, v)
                       for k, v in value.items())

        if any(k.startswith("$text") for k in value):
            # Text is only paired with the data when a condition needs it.
            return (pos
                    for pos, (datum, text)
                    in enumerate(zip(self.data, self.text))
                    if satisfied(datum, text))
        return (pos for pos, datum in enumerate(self.data)
                if satisfied(datum, None))

    raise Exception("Bad value: " + repr(value))

root_layer()

Return the name of the root layer of the layer.

Returns:

Name Type Description
str str

The name of the root layer of this layer.

Source code in teanga/document.py
548
549
550
551
552
553
554
555
556
557
558
559
560
561
def root_layer(self) -> str:
    """Walk the chain of base layers and name the layer at its root.

    Returns:
        str: This layer's own name when it has no base, the base's name
            when the base is a `CharacterLayer`, otherwise whatever the
            base layer reports as its root.
    """
    base_name = self._meta.base
    if base_name is None:
        return self._name
    parent = self._doc.layers[base_name]
    if isinstance(parent, CharacterLayer):
        return base_name
    return parent.root_layer()

DivLayer

Bases: StandoffLayer

A layer where the sublayer is divided into non-overlapping parts. As such, these layers have only a start index for each annotation, and each annotation spans until the start of the next annotation.

Source code in teanga/document.py
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
class DivLayer(StandoffLayer):
    """A layer where the sublayer is divided into non-overlapping parts.
    As such these layers have only a start index for each annotation, and that
    annotation spans until the next annotation"""

    def __init__(self, name:str, doc:Document, spans:list):
        """Create the layer and validate the span data.

        Parameters:
            name: Name of the layer.
            doc: The document the layer belongs to.
            spans: One entry per annotation, either a bare integer start
                index or a sequence whose first element is the start index.

        Raises:
            Exception: If an entry is neither an integer nor a sequence
                starting with an integer.
        """
        super().__init__(name, doc)
        self._data = spans
        for span in self._data:
            if (not isinstance(span, numbers.Integral) and
                not isinstance(span[0], numbers.Integral)):
                raise Exception("Bad span data: " + repr(span))

    @staticmethod
    def _start(span):
        """Return the start index of a span given as `start` or `[start, ...]`."""
        return span if isinstance(span, numbers.Integral) else span[0]

    @property
    def data(self):
        """
        Return the data values of the layer.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "sentences": LayerDesc(layer_type="div", base="text")},
            ... text="This is an example. This is another example.")
            >>> doc["sentences"] = [0, 19]
            >>> doc["sentences"].data
            [None, None]
        """
        if self._meta.data is None:
            return [None] * len(self._data)
        elif self._meta.data == "link" and self._meta.link_types:
            return [(s[1], s[2]) for s in self._data]
        else:
            return [s[1] for s in self._data]

    def indexes(self, layer:str):
        """
        Return the indexes of the annotations of this layer.

        Parameters:
            layer: The layer in whose coordinates the spans are expressed:
                this layer itself, its base layer, or a layer the base
                layer can be indexed by.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "sentences": LayerDesc(layer_type="div", base="text")},
            ... text="This is an example. This is another example.")
            >>> doc["sentences"] = [0, 19]
            >>> doc["sentences"].indexes("sentences")
            [(0, 1), (1, 2)]
            >>> doc["sentences"].indexes("text")
            [(0, 19), (19, 44)]
        """
        # Spans may carry data after the start index (see `data`), so the
        # start has to be extracted before it can be used as a boundary.
        starts = [self._start(s) for s in self._data]
        if layer == self._name:
            return list(zip(range(len(starts)), range(1, len(starts) + 1)))
        elif layer == self._meta.base:
            # Each division ends where the next begins; the last one ends at
            # the end of the base layer.
            return list(pairwise(chain(starts,
                                  [len(self._doc.layers[self._meta.base])])))
        else:
            subindexes = list(self._doc.layers[self._meta.base].indexes(layer))
            return list(pairwise(
                chain(
                    (subindexes[s][0] for s in starts),
                    [len(self._doc.layers[layer])])))

    def __repr__(self):
        return "DivLayer(" + repr(self._data) + ")"

    def transform(self, transform_func):# -> Self:
        """Return a new `DivLayer` with `transform_func` applied to every span."""
        return DivLayer(self._name, self._doc, [transform_func(x) for x in self._data])

data property

Return the data values of the layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "sentences": LayerDesc(layer_type="div", base="text")},
... text="This is an example. This is another example.")
>>> doc["sentences"] = [0, 19]
>>> doc["sentences"].data
[None, None]

indexes(layer)

Return the indexes of the annotations of this layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "sentences": LayerDesc(layer_type="div", base="text")},
... text="This is an example. This is another example.")
>>> doc["sentences"] = [0, 19]
>>> doc["sentences"].indexes("sentences")
[(0, 1), (1, 2)]
>>> doc["sentences"].indexes("text")
[(0, 19), (19, 44)]
Source code in teanga/document.py
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
def indexes(self, layer:str):
    """
    Return the indexes of the annotations of this layer.

    Examples:
        >>> doc = Document({"text": LayerDesc(layer_type="characters"),
        ... "sentences": LayerDesc(layer_type="div", base="text")},
        ... text="This is an example. This is another example.")
        >>> doc["sentences"] = [0, 19]
        >>> doc["sentences"].indexes("sentences")
        [(0, 1), (1, 2)]
        >>> doc["sentences"].indexes("text")
        [(0, 19), (19, 44)]
    """
    if layer == self._name:
        return list(zip(range(len(self._data)), range(1, len(self._data) + 1)))
    elif layer == self._meta.base:
        return list(pairwise(chain((s for s in self._data),
                              [len(self._doc.layers[self._meta.base])])))
    else:
        subindexes = list(self._doc.layers[self._meta.base].indexes(layer))
        return list(pairwise(
            chain(
                (subindexes[s][0] for s in self._data),
                [len(self._doc.layers[layer])])))

Document

Document class for storing and processing text data.

Source code in teanga/document.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
class Document:
    """Document class for storing and processing text data.

    A document holds a set of named layers (`layers`), the layer
    descriptions they must conform to (`meta`) and free-form metadata
    (`metadata`), whose keys are written with a leading underscore when
    passed as keyword arguments or set as attributes/items.
    """
    def __init__(self, meta:dict[str,Union[LayerDesc,dict]],
                 _pyo3=None, id=None, corpus_ref=None, **kwargs):
        self._meta = meta
        self.layers = {}
        # `id` and `_pyo3` stay unset while the initial layers are added:
        # `__setitem__` checks both, so this keeps the initial load from
        # being treated as an edit of an existing, stored document.
        self._pyo3 = None
        self.id = None
        self._corpus_ref = corpus_ref
        self._metadata = {key[1:]: value
                         for key, value in kwargs.items()
                         if key.startswith("_")}
        self.add_layers({key: value
                         for key, value in kwargs.items()
                         if not key.startswith("_")})
        self.id = id
        self._pyo3 = _pyo3

    def copy(self):
        """Return a copy of the document.

        The copy shares the layer objects and metadata values with the
        original, but has its own `layers` and `metadata` dictionaries.
        """
        duplicate = Document(self._meta, self._pyo3, self.id,
                             **{key: value for key, value in self.layers.items()})
        # Metadata takes part in `__eq__`, so a copy must carry it too.  It is
        # copied after construction so that a metadata key can never collide
        # with a constructor parameter such as `_pyo3`.
        duplicate._metadata.update(self._metadata)
        return duplicate

    @deprecated(reason="Use __setitem__ instead, e.g., doc['text'] = \
'This is a document.'")
    def add_layer(self, name:str, value : Union[str,list,'Layer']) -> 'Layer':
        return self.__setitem__(name, value)

    def __setitem__(self, name:str, value : Union[str,list,'Layer']) -> 'Layer':
        """Add or set a layer to the document.

        A name starting with an underscore sets a metadata value instead
        of a layer (and nothing is returned in that case).

        Parameters:
            name: str
                Name of the layer.
            value: str
                Value of the layer, a single string or
                a list of values that are suitable for the
                Teanga layer type or a Layer object.

        Raises:
            Exception: If the layer is not described in the meta, has a
                missing or invalid type or base, or the value does not fit
                the layer type.

        Examples:
            >>> from teanga import Corpus
            >>> corpus = Corpus()
            >>> corpus.add_layer_meta("text")
            >>> corpus.add_layer_meta("words", layer_type="span", base="text")
            >>> corpus.add_layer_meta("pos", layer_type="seq", base="words", data="string")
            >>> doc = corpus.add_doc("This is a document.")
            >>> doc["words"] = [(0,4), (5,7), (8,9), (10,18), (18,19)]
            >>> doc["pos"] = ["DT", "VBZ", "DT", "NN", "."]
            >>> doc
            Document('Kjco', {'text': 'This is a document.', \
'words': SpanLayer([[0, 4], [5, 7], [8, 9], [10, 18], [18, 19]]), \
'pos': SeqLayer(['DT', 'VBZ', 'DT', 'NN', '.'])})
            >>> corpus.doc_by_id("Kjco")
            Document('Kjco', {'text': 'This is a document.', \
'words': SpanLayer([[0, 4], [5, 7], [8, 9], [10, 18], [18, 19]]), \
'pos': SeqLayer(['DT', 'VBZ', 'DT', 'NN', '.'])})

        """
        if name.startswith("_"):
            self._metadata[name[1:]] = value
            return
        if name not in self._meta:
            raise Exception("Layer with name " + name + " does not exist.")
        if value is None and self._meta[name].default is not None:
            value = self._meta[name].default
        if isinstance(value, Layer):
            # Ready-made layers are stored as they are, without validation.
            self.layers[name] = value
            return value
        if self._meta[name].layer_type is None:
            raise Exception("Layer " + name + " has no layer type.")
        if self._meta[name].layer_type not in ["characters", "seq", "span", "div", "element"]:
            raise Exception("Invalid layer type " + self._meta[name].layer_type)
        if self._meta[name].layer_type == "characters":
            if self.id and not self._corpus_ref:
                raise Exception("Cannot add character layer to existing document.")
            elif self.id and self._corpus_ref:
                # The corpus hands back the id to use after the text changed.
                old_id = self.id
                self.layers[name] = CharacterLayer(str(value))
                self.id = self._corpus_ref.update_doc(old_id, self)
            else:
                self.layers[name] = CharacterLayer(str(value))
        elif self._meta[name].base is None:
            raise Exception("Non-character layer " + name + " must have a base.")
        elif (self._meta[name].base not in self._meta):
            raise Exception("Layer refers to non-existent base layer: " +
                str(self._meta[name].base))
        elif (self._meta[name].base not in self.layers and
                self._meta[self._meta[name].base].default is None):
            raise Exception("Cannot add layer " + name + " because sublayer " +
            self._meta[name].base + " does not exist.")
        elif self._meta[name].layer_type == "seq":
            if not isinstance(value, list):
                raise Exception("Value of layer " + name + " must be a list.")
            value = [validate_value(v, 0) for v in value]
            # A seq layer has exactly one value per annotation of its base.
            if self._meta[name].base in self.layers:
                base_layer_len = len(self.layers[self._meta[name].base])
            elif self._meta[self._meta[name].base].default is not None:
                base_layer_len = len(self._meta[self._meta[name].base].default)
            else:
                raise Exception("Cannot add layer " + name + " because sublayer " +
                    self._meta[name].base + " does not exist.")
            if len(value) != base_layer_len:
                raise Exception("Value of layer " + name + " must have the " +
                "same length as layer " + self._meta[name].base + ".")
            self.layers[name] = SeqLayer(name, self, value)
        elif self._meta[name].layer_type == "span":
            if not isinstance(value, list):
                raise Exception("Value of layer " + name + " must be a list.")
            value = [validate_value(v, 2) for v in value]
            self.layers[name] = SpanLayer(name, self, value)
        elif self._meta[name].layer_type == "div":
            if not isinstance(value, list):
                raise Exception("Value of layer " + name + " must be a list.")
            value = [validate_value(v, 1) for v in value]
            self.layers[name] = DivLayer(name, self, value)
        elif self._meta[name].layer_type == "element":
            if not isinstance(value, list):
                raise Exception("Value of layer " + name + " must be a list.")
            value = [validate_value(v, 1) for v in value]
            self.layers[name] = ElementLayer(name, self, value)
        else:
            raise Exception("Unknown layer type " + self._meta[name].layer_type +
            " for layer " + name + ".")
        if self._pyo3 and self.id:
            # Push the full set of layers to the backing store.
            data_fields = {layer_name: layer.raw
                           for (layer_name, layer) in self.layers.items()}
            self._pyo3.update_doc(self.id, data_fields)

        return self.layers[name]

    def __getattr__(self, name:str) -> 'Layer':
        """Return the layer with the given name.

        Names starting with an underscore are looked up in the metadata
        (without the underscore) instead.

        Raises:
            AttributeError: If there is no such layer or metadata value.
        """
        # `__getattr__` only runs when normal lookup fails.  Reading
        # `self._metadata` or `self.layers` here would therefore re-enter
        # this method without bound whenever those attributes are not set
        # yet (e.g. when copy/pickle probe a freshly created instance), so
        # both are read straight from the instance dictionary.
        state = self.__dict__
        if name.startswith("_"):
            metadata = state.get("_metadata", {})
            if name[1:] in metadata:
                return metadata[name[1:]]
            else:
                raise AttributeError("No such metadata: " + name)
        layers = state.get("layers", {})
        if name in layers:
            return layers[name]
        else:
            raise AttributeError("No such layer: " + name)

    def __setattr__(self, name:str, value) -> None:
        """Set the value of a layer.

        The document's own attributes are stored normally; every other
        name is routed to `__setitem__`.
        """
        if name in ("layers", "_meta", "_pyo3", "id", "_metadata", "_corpus_ref"):
            super().__setattr__(name, value)
        else:
            self.__setitem__(name, value)

    def add_layers(self, layers:dict):
        """Add multiple layers in one go.

        Layers are added in dependency order, i.e. a layer is only added
        once its base layer is present.

        Parameters:
            layers: dict
                A dictionary of layer names and values.

        Raises:
            Exception: If a base layer is missing, or if the base layers
                of the given layers depend on each other.

        Examples:
            >>> from teanga import Corpus
            >>> corpus = Corpus()
            >>> corpus.add_layer_meta("text")
            >>> corpus.add_layer_meta("words", layer_type="span", base="text")
            >>> corpus.add_layer_meta("pos", layer_type="seq", base="words", data="string")
            >>> doc = corpus.add_doc("This is a document.")
            >>> doc.add_layers({"words": [(0,4), (5,7), (8,9), (10,18), (18,19)], \
    "pos": ["DT", "VBZ", "DT", "NN", "."]})
            """
        added = set(self.layers.keys())
        to_add = set(layers.keys())

        # Layers with a default value count as present even when not given.
        for layer in self._meta:
            if layer not in layers and self._meta[layer].default is not None:
                added.add(layer)

        while len(to_add) > 0:
            progressed = False
            for name in to_add.copy():
                data = layers[name]
                if self._meta[name].base is None or self._meta[name].base in added:
                    self[name] = data
                    added.add(name)
                    to_add.remove(name)
                    progressed = True
                elif (self._meta[name].base is not None
                      and self._meta[name].base not in layers
                      and self._meta[name].base not in added):
                    raise Exception("Cannot add layer " + name + " because sublayer " +
                    self._meta[name].base + " does not exist.")
            if not progressed:
                # Every remaining layer waits for another remaining layer,
                # which would otherwise keep this loop running forever.
                raise Exception("Cannot add layers " + ", ".join(sorted(to_add)) +
                " because their base layers depend on each other.")

    def __getitem__(self, name:str):
        """Return the value of a layer.

        Parameters:
            name: str
                The name of the layer.

        Raises:
            TypeError: If `name` is not a string.
            Exception: If no layer of that name is described in the meta.
        """
        if not isinstance(name, str):
            raise TypeError(f"Layer name must be a string, not {type(name).__name__}")
        if name not in self._meta:
            raise Exception("Layer with name " + name + " does not exist.")
        return self.layers[name]

    def __iter__(self):
        """Return an iterator over the layers."""
        return iter(self.layers)

    def __contains__(self, name:str) -> bool:
        """Return whether a layer with the given name exists."""
        return name in self.layers

    @deprecated(reason="Access layers using __getitem__ instead, e.g., doc['text']")
    def get_layer(self, name:str):
        return self[name]

    def character_layers(self) -> dict[str, str]:
        """Get the character layers for this document (used to calculate the ID)"""
        return {layer: self.layers[layer].raw
                for layer in self.layers
                if self._meta[layer].layer_type == "characters"}

    @property
    def meta(self):
        """Get the layer descriptions this document was created with."""
        return self._meta

    @property
    def metadata(self):
        """Get the dictionary of meta layers."""
        return self._metadata

    def text_for_layer(self, layer_name:str) -> Union[str, Generator[str, None, None]]:
        """Return the text for a layer.

        Parameters:
            layer_name: str
                The name of the layer.

        Returns:
            The text itself for a character layer, otherwise a generator
            that yields the text of each annotation of the layer.

        Raises:
            Exception: If no layer of that name is described in the meta.

        Examples:
            >>> from teanga import Corpus
            >>> corpus = Corpus()
            >>> corpus.add_layer_meta("text")
            >>> corpus.add_layer_meta("words", layer_type="span", base="text")
            >>> corpus.add_layer_meta("pos", layer_type="seq", base="words")
            >>> doc = corpus.add_doc("This is a document.")
            >>> doc.words = [[0,4], [5,7], [8,9], [10,18], [18,19]]
            >>> doc.pos = ["DT", "VBZ", "DT", "NN", "."]
            >>> list(doc.text_for_layer("text"))
            ['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 'd', 'o', \
'c', 'u', 'm', 'e', 'n', 't', '.']
            >>> list(doc.text_for_layer("words"))
            ['This', 'is', 'a', 'document', '.']
            >>> list(doc.text_for_layer("pos"))
            ['This', 'is', 'a', 'document', '.']
        """
        if layer_name not in self._meta:
            raise Exception("Layer with name " + layer_name + " does not exist.")
        if self._meta[layer_name].layer_type == "characters":
            return self.layers[layer_name].text[0]
        else:
            # Follow the chain of bases down to the character layer.
            text_layer = layer_name
            while self._meta[text_layer].layer_type != "characters":
                text_layer = self._meta[text_layer].base
            indexes = self.layers[layer_name].indexes(text_layer)
            text = self.layers[text_layer].text[0]
            return (text[start:end]
                    for start, end in indexes)

    def view(self, *args, start: int = 0, end: int = None, root_layer: str = None):
        """Return a view of the document. A view is a grouping of the basic
          text data according to the annotations in the document

        Parameters:
        -----------

        args: list
            The layers to view the text data by.
        start: int
            The start index of the view.
        end: int
            The end index of the view (defaults to the length of the
            root layer).
        root_layer: str
            The root layer of the view (this should not normally be specified).

        Returns:
        --------

        The text data grouped by the annotations in the document.

        Examples:
        ---------

        >>> from teanga import Corpus
        >>> corpus = Corpus()
        >>> corpus.add_layer_meta("text")
        >>> corpus.add_layer_meta("words", layer_type="span", base="text")
        >>> corpus.add_layer_meta("sentences", layer_type="div", base="text")
        >>> doc = corpus.add_doc("This is a sentence. This is another sentence.")
        >>> doc.words = [[0,4], [5,7], [8,9], [10,18], [18,19], [20,24], \
[25,27], [28,35], [36,44], [44,45]]
        >>> doc.sentences = [0, 20]
        >>> doc.view("words")
        ['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', 'sentence', '.']
        >>> doc.view("sentences")
        ['This is a sentence. ', 'This is another sentence.']
        >>> doc.view("words", "sentences")
        [['This', 'is', 'a', 'sentence', '.'], ['This', 'is', 'another', \
'sentence', '.']]
          """
        if root_layer is None:
            # All requested layers must be rooted in the same layer.
            for layer in args:
                if self._meta[layer].layer_type == "characters":
                    rl = layer
                else:
                    rl = self.layers[layer].root_layer()
                if root_layer is not None and rl != root_layer:
                    raise Exception("view was called with layers that have " +
                    "different root layers")
                root_layer = rl
        if root_layer is None:
            # No layers were requested: fall back to the first layer that
            # has no base.
            # NOTE(review): because of the `break`, the "multiple root
            # layers" error below can never be raised; the first base-less
            # layer always wins.  Left as is to keep current behaviour.
            for layer in self.layers:
                if self._meta[layer].base is None:
                    if root_layer is not None:
                        raise Exception("view was called without specifying any" +
                        "layers or root layer but there are multiple root " +
                        "layers in the document")
                    root_layer = layer
                    break
        if root_layer is None:
            raise Exception("view could not determine a root layer for the document")
        if end is None:
            # Measure the root layer itself, whatever it is called, rather
            # than assuming that it is named "text".
            end = len(self.layers[root_layer])
        if len(args) == 0:
            return self.text_for_layer(root_layer)[start:end]
        else:
            indexes = self.layers[args[-1]].indexes(root_layer)
            indexes = [(s, e) for s,e in indexes
                       if s >= start and e <= end]
            return [self.view(*args[:-1], start=s, end=e, root_layer=root_layer)
                    for s, e in indexes]

    def to_json(self) -> dict:
        """Return the JSON representation of the document.

        The result is a JSON-style dictionary mapping each layer name to
        the raw value of the layer; metadata is not included.
        """
        return {layer_id: self.layers[layer_id].raw
                for layer_id in self.layers.keys()}

    @staticmethod
    def from_json(json:dict, meta:dict, _pyo3=None, id=None) -> 'Document':
        """Return a document from its JSON representation."""
        doc = Document(meta, _pyo3, id)
        doc.add_layers(json)
        return doc

    def __repr__(self):
        return "Document(" + repr(self.id) + ", " + repr(self.layers) + ")"

    def __eq__(self, other):
        """Two documents are equal if id, layers and metadata are all equal."""
        if not isinstance(other, Document):
            return False
        if self.id != other.id:
            return False
        if self.layers != other.layers:
            return False
        if self._metadata != other._metadata:
            return False
        return True

    def _repr_html_(self):
        """Return an HTML representation of the document.

        Layer names, layer data and metadata are HTML-escaped, since they
        come from the document content and would otherwise be interpreted
        as markup by the notebook front end.
        """
        import html  # local import: only needed for this rendering

        def esc(value):
            # quote=False: the values are only used as element text.
            return html.escape(str(value), quote=False)

        parts = ["<h2>Document " + esc(repr(self.id)) + "</h2>",
                 "<table>",
                 "<tr><th>Layer</th><th>Type</th><th>Data</th></tr>"]
        for layer_name in self.layers:
            parts.append("<tr>")
            parts.append("<td>" + esc(layer_name) + "</td>")
            parts.append("<td>" + esc(self._meta[layer_name].layer_type) + "</td>")
            parts.append("<td>" + esc(clip_string(str(self[layer_name].raw))) + "</td>")
            parts.append("</tr>")
        parts.append("</table>")
        if self._metadata:
            parts.append("<h3>Metadata</h3>")
            parts.append("<table>")
            parts.append("<tr><th>Key</th><th>Value</th></tr>")
            for key, value in self._metadata.items():
                parts.append("<tr>")
                parts.append("<td>" + esc(key) + "</td>")
                parts.append("<td>" + esc(repr(value)) + "</td>")
                parts.append("</tr>")
            parts.append("</table>")
        return "".join(parts)

metadata property

Get the dictionary of meta layers.

__contains__(name)

Return whether a layer with the given name exists.

Source code in teanga/document.py
216
217
218
def __contains__(self, name:str) -> bool:
    """Tell whether ``name`` is present in the document's layers."""
    layers = self.layers
    return name in layers

__getattr__(name)

Return the layer with the given name.

Source code in teanga/document.py
143
144
145
146
147
148
149
150
151
152
153
def __getattr__(self, name:str) -> 'Layer':
    """Return the layer with the given name."""
    if name.startswith("_"):
        if name[1:] in self._metadata:
            return self._metadata[name[1:]]
        else:
            raise AttributeError("No such metadata: " + name)
    if name in self.layers:
        return self.layers[name]
    else:
        raise AttributeError("No such layer: " + name)

__getitem__(name)

Return the value of a layer.

Parameters:

Name Type Description Default
name str

str The name of the layer.

required
Source code in teanga/document.py
199
200
201
202
203
204
205
206
207
208
209
210
def __getitem__(self, name:str):
    """Look up a layer of the document by name.

    Parameters:
        name: str
            The name of the layer.

    Raises:
        TypeError: If ``name`` is not a string.
        Exception: If ``name`` is not declared in the document's meta.
    """
    if isinstance(name, str):
        if name in self._meta:
            return self.layers[name]
        raise Exception("Layer with name " + name + " does not exist.")
    raise TypeError(f"Layer name must be a string, not {type(name).__name__}")

__iter__()

Return an iterator over the layers.

Source code in teanga/document.py
212
213
214
def __iter__(self):
    """Return an iterator over ``self.layers``."""
    layer_iterator = iter(self.layers)
    return layer_iterator

__setattr__(name, value)

Set the value of a layer.

Source code in teanga/document.py
155
156
157
158
159
160
def __setattr__(self, name:str, value) -> None:
    """Assign an attribute on the document.

    The document's own field names are stored as ordinary attributes; every
    other name is forwarded to ``__setitem__``.
    """
    if name in ("layers", "_meta", "_pyo3", "id", "_metadata", "_corpus_ref"):
        super().__setattr__(name, value)
    else:
        self.__setitem__(name, value)

__setitem__(name, value)

Add or set a layer to the document.

Parameters:

Name Type Description Default
name str

str Name of the layer.

required
value Union[str, list, Layer]

str Value of the layer, a single string or a list of values that are suitable for the Teanga layer type or a Layer object.

required

Examples:

>>> from teanga import Corpus
>>> corpus = Corpus()
>>> corpus.add_layer_meta("text")
>>> corpus.add_layer_meta("words", layer_type="span", base="text")
>>> corpus.add_layer_meta("pos", layer_type="seq", base="words", data="string")
>>> doc = corpus.add_doc("This is a document.")
>>> doc["words"] = [(0,4), (5,7), (8,9), (10,18), (18,19)]
>>> doc["pos"] = ["DT", "VBZ", "DT", "NN", "."]
>>> doc
Document('Kjco', {'text': 'This is a document.', 'words': SpanLayer([[0, 4], [5, 7], [8, 9], [10, 18], [18, 19]]), 'pos': SeqLayer(['DT', 'VBZ', 'DT', 'NN', '.'])})
>>> corpus.doc_by_id("Kjco")
Document('Kjco', {'text': 'This is a document.', 'words': SpanLayer([[0, 4], [5, 7], [8, 9], [10, 18], [18, 19]]), 'pos': SeqLayer(['DT', 'VBZ', 'DT', 'NN', '.'])})
Source code in teanga/document.py
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
    def __setitem__(self, name:str, value : Union[str,list,'Layer']) -> 'Layer':
        """Add or set a layer (or a metadata value) of the document.

        Parameters:
            name: str
                Name of the layer. A name starting with an underscore is
                treated as a metadata key (stored without the underscore).
            value: Union[str, list, Layer]
                Value of the layer, a single string or
                a list of values that are suitable for the
                Teanga layer type or a Layer object.

        Returns:
            The layer that was stored, or ``None`` when a metadata value
            was set.

        Raises:
            Exception: If the layer is not declared in the meta, has no or
                an invalid layer type, lacks a usable base layer, or the
                value does not fit the layer type.

        Examples:
            >>> from teanga import Corpus
            >>> corpus = Corpus()
            >>> corpus.add_layer_meta("text")
            >>> corpus.add_layer_meta("words", layer_type="span", base="text")
            >>> corpus.add_layer_meta("pos", layer_type="seq", base="words", data="string")
            >>> doc = corpus.add_doc("This is a document.")
            >>> doc["words"] = [(0,4), (5,7), (8,9), (10,18), (18,19)]
            >>> doc["pos"] = ["DT", "VBZ", "DT", "NN", "."]
            >>> doc
            Document('Kjco', {'text': 'This is a document.', \
'words': SpanLayer([[0, 4], [5, 7], [8, 9], [10, 18], [18, 19]]), \
'pos': SeqLayer(['DT', 'VBZ', 'DT', 'NN', '.'])})
            >>> corpus.doc_by_id("Kjco")
            Document('Kjco', {'text': 'This is a document.', \
'words': SpanLayer([[0, 4], [5, 7], [8, 9], [10, 18], [18, 19]]), \
'pos': SeqLayer(['DT', 'VBZ', 'DT', 'NN', '.'])})

        """
        # Underscore-prefixed names are metadata, not layers; nothing is
        # returned on this path.
        if name.startswith("_"):
            self._metadata[name[1:]] = value
            return
        if name not in self._meta:
            raise Exception("Layer with name " + name + " does not exist.")
        # A None value falls back to the default declared for the layer.
        if value is None and self._meta[name].default is not None:
            value = self._meta[name].default
        # A ready-made Layer object is stored as is; this returns before the
        # validation below and before the _pyo3 update at the end.
        if isinstance(value, Layer):
            self.layers[name] = value
            return value
        if self._meta[name].layer_type is None:
            raise Exception("Layer " + name + " has no layer type.")
        if self._meta[name].layer_type not in ["characters", "seq", "span", "div", "element"]:
            raise Exception("Invalid layer type " + self._meta[name].layer_type)
        if self._meta[name].layer_type == "characters":
            # A document that already has an id may only change its text
            # when it holds a corpus reference; in that case the id is
            # replaced by whatever update_doc returns.
            if self.id and not self._corpus_ref:
                raise Exception("Cannot add character layer to existing document.")
            elif self.id and self._corpus_ref:
                old_id = self.id
                self.layers[name] = CharacterLayer(str(value))
                self.id = self._corpus_ref.update_doc(old_id, self)
            else:
                self.layers[name] = CharacterLayer(str(value))
        # Every non-character layer needs a base layer that is declared in
        # the meta and is either present in the document or has a default.
        elif self._meta[name].base is None:
            raise Exception("Non-character layer " + name + " must have a base.")
        elif (self._meta[name].base not in self._meta):
            raise Exception("Layer refers to non-existent base layer: " +
                str(self._meta[name].base))
        elif (self._meta[name].base not in self.layers and
                self._meta[self._meta[name].base].default is None):
            raise Exception("Cannot add layer " + name + " because sublayer " +
            self._meta[name].base + " does not exist.")
        elif self._meta[name].layer_type == "seq":
            if not isinstance(value, list):
                raise Exception("Value of layer " + name + " must be a list.")
            # NOTE(review): the second argument of validate_value presumably
            # is the number of leading index fields expected per entry
            # (0 for seq, 2 for span, 1 for div/element) - confirm.
            value = [validate_value(v, 0) for v in value]
            # A seq layer must have exactly one value per element of its
            # base layer (or of the base layer's default value).
            if self._meta[name].base in self.layers:
                base_layer_len = len(self.layers[self._meta[name].base])
            elif self._meta[self._meta[name].base].default is not None:
                base_layer_len = len(self._meta[self._meta[name].base].default)
            else:
                # Not reachable: the elif above already raised for a base
                # that is neither present nor has a default.
                raise Exception("Cannot add layer " + name + " because sublayer " +
                    self._meta[name].base + " does not exist.")
            if len(value) != base_layer_len:
                raise Exception("Value of layer " + name + " must have the " +
                "same length as layer " + self._meta[name].base + ".")
            self.layers[name] = SeqLayer(name, self, value)
        elif self._meta[name].layer_type == "span":
            if not isinstance(value, list):
                raise Exception("Value of layer " + name + " must be a list.")
            value = [validate_value(v, 2) for v in value]
            self.layers[name] = SpanLayer(name, self, value)
        elif self._meta[name].layer_type == "div":
            if not isinstance(value, list):
                raise Exception("Value of layer " + name + " must be a list.")
            value = [validate_value(v, 1) for v in value]
            self.layers[name] = DivLayer(name, self, value)
        elif self._meta[name].layer_type == "element":
            if not isinstance(value, list):
                raise Exception("Value of layer " + name + " must be a list.")
            value = [validate_value(v, 1) for v in value]
            self.layers[name] = ElementLayer(name, self, value)
        else:
            # Not reachable: the layer type was checked against the same
            # five values above.
            raise Exception("Unknown layer type " + self._meta[name].layer_type +
            " for layer " + name + ".")
        # When the document is backed by a _pyo3 object and has an id, push
        # the raw values of all layers to it. The comprehension's ``name``
        # is local to the comprehension and does not change the parameter.
        if self._pyo3 and self.id:
            data_fields = {name: layer.raw
                           for (name,layer) in self.layers.items()}
            self._pyo3.update_doc(self.id, data_fields)

        return self.layers[name]

add_layers(layers)

Add multiple layers in one go.

Parameters:

Name Type Description Default
layers dict

dict A dictionary of layer names and values.

required

Examples:

>>> from teanga import Corpus
>>> corpus = Corpus()
>>> corpus.add_layer_meta("text")
>>> corpus.add_layer_meta("words", layer_type="span", base="text")
>>> corpus.add_layer_meta("pos", layer_type="seq", base="words", data="string")
>>> doc = corpus.add_doc("This is a document.")
>>> doc.add_layers({"words": [(0,4), (5,7), (8,9), (10,18), (18,19)],     "pos": ["DT", "VBZ", "DT", "NN", "."]})
Source code in teanga/document.py
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def add_layers(self, layers:dict):
    """Add multiple layers in one go.

    Layers are added in dependency order: a layer is only set once its base
    layer is already in the document, has a default value, or has been
    added earlier in this call.

    Parameters:
        layers: dict
            A dictionary of layer names and values.

    Raises:
        Exception: If a layer's base is neither available nor among
            ``layers``, or if the remaining layers depend on each other so
            that none of them can be added.

    Examples:
        >>> from teanga import Corpus
        >>> corpus = Corpus()
        >>> corpus.add_layer_meta("text")
        >>> corpus.add_layer_meta("words", layer_type="span", base="text")
        >>> corpus.add_layer_meta("pos", layer_type="seq", base="words", data="string")
        >>> doc = corpus.add_doc("This is a document.")
        >>> doc.add_layers({"words": [(0,4), (5,7), (8,9), (10,18), (18,19)],
        ...     "pos": ["DT", "VBZ", "DT", "NN", "."]})
    """
    # Layers that may serve as a base: those already present plus those
    # that are not being set here but have a default value.
    added = set(self.layers.keys())
    for layer in self._meta:
        if layer not in layers and self._meta[layer].default is not None:
            added.add(layer)

    to_add = set(layers.keys())
    while to_add:
        progressed = False
        for name in to_add.copy():
            base = self._meta[name].base
            if base is None or base in added:
                self[name] = layers[name]
                added.add(name)
                to_add.remove(name)
                progressed = True
            elif base not in layers:
                raise Exception("Cannot add layer " + name + " because sublayer " +
                base + " does not exist.")
        if not progressed:
            # Every remaining layer waits for another remaining layer, so
            # another pass could never succeed; without this check the
            # loop would spin forever.
            raise Exception("Cannot add layers " + ", ".join(sorted(to_add)) +
            " because their base layers depend on each other.")

character_layers()

Get the character layers for this document (used to calculate the ID)

Source code in teanga/document.py
224
225
226
227
228
def character_layers(self) -> dict[str, str]:
    """Collect the raw values of the character layers of this document
    (used to calculate the ID)."""
    result = {}
    for layer_name, layer in self.layers.items():
        if self._meta[layer_name].layer_type == "characters":
            result[layer_name] = layer.raw
    return result

copy()

Return a copy of the document.

Source code in teanga/document.py
31
32
33
34
def copy(self):
    """Return a copy of the document.

    The copy is a new ``Document`` built from this document's meta,
    ``_pyo3`` object and id, with the current layers passed as keyword
    arguments.
    """
    layer_kwargs = dict(self.layers.items())
    return Document(self._meta, self._pyo3, self.id, **layer_kwargs)

from_json(json, meta, _pyo3=None, id=None) staticmethod

Return a document from its JSON representation.

Source code in teanga/document.py
355
356
357
358
359
360
@staticmethod
def from_json(json:dict, meta:dict, _pyo3=None, id=None) -> 'Document':
    """Build a document from its JSON representation.

    A new ``Document`` is created from ``meta``, ``_pyo3`` and ``id`` and
    the entries of ``json`` are then added with ``add_layers``.
    """
    document = Document(meta, _pyo3, id)
    document.add_layers(json)
    return document

text_for_layer(layer_name)

Return the text for a layer.

Parameters:

Name Type Description Default
layer_name str

str The name of the layer.

required

Returns:

Type Description
str

A generator that yields the text for the layer.

Examples:

>>> from teanga import Corpus
>>> corpus = Corpus()
>>> corpus.add_layer_meta("text")
>>> corpus.add_layer_meta("words", layer_type="span", base="text")
>>> corpus.add_layer_meta("pos", layer_type="seq", base="words")
>>> doc = corpus.add_doc("This is a document.")
>>> doc.words = [[0,4], [5,7], [8,9], [10,18], [18,19]]
>>> doc.pos = ["DT", "VBZ", "DT", "NN", "."]
>>> list(doc.text_for_layer("text"))
['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 'd', 'o', 'c', 'u', 'm', 'e', 'n', 't', '.']
>>> list(doc.text_for_layer("words"))
['This', 'is', 'a', 'document', '.']
>>> list(doc.text_for_layer("pos"))
['This', 'is', 'a', 'document', '.']
Source code in teanga/document.py
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
    def text_for_layer(self, layer_name:str) -> Generator[None,None,str]:
        """Return the text for a layer.

        Parameters:
            layer_name: str
                The name of the layer.

        Returns:
            A generator that yields the text for the layer.

        Examples:
            >>> from teanga import Corpus
            >>> corpus = Corpus()
            >>> corpus.add_layer_meta("text")
            >>> corpus.add_layer_meta("words", layer_type="span", base="text")
            >>> corpus.add_layer_meta("pos", layer_type="seq", base="words")
            >>> doc = corpus.add_doc("This is a document.")
            >>> doc.words = [[0,4], [5,7], [8,9], [10,18], [18,19]]
            >>> doc.pos = ["DT", "VBZ", "DT", "NN", "."]
            >>> list(doc.text_for_layer("text"))
            ['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 'd', 'o', \
'c', 'u', 'm', 'e', 'n', 't', '.']
            >>> list(doc.text_for_layer("words"))
            ['This', 'is', 'a', 'document', '.']
            >>> list(doc.text_for_layer("pos"))
            ['This', 'is', 'a', 'document', '.']
        """
        if layer_name not in self._meta:
            raise Exception("Layer with name " + layer_name + " does not exist.")
        if self._meta[layer_name].layer_type == "characters":
            return self.layers[layer_name].text[0]
        else:
            text_layer = layer_name
            while self._meta[text_layer].layer_type != "characters":
                text_layer = self._meta[text_layer].base
            indexes = self.layers[layer_name].indexes(text_layer)
            text = self.layers[text_layer].text[0]
            return (text[start:end]
                    for start, end in indexes)

to_json()

Return the JSON representation of the document.

Source code in teanga/document.py
350
351
352
353
def to_json(self) -> dict:
    """Return the JSON-serialisable representation of the document.

    Returns:
        A dictionary mapping every layer name in ``self.layers`` to the
        ``raw`` value of that layer.
    """
    return {layer_id: layer.raw
            for layer_id, layer in self.layers.items()}

view(*args, start=0, end=None, root_layer=None)

Return a view of the document. A view is a grouping of the basic text data according to the annotations in the document

Parameters:

- args: list. The layers to view the text data by.
- start: int. The start index of the view.
- end: int. The end index of the view.
- root_layer: str. The root layer of the view (this should not normally be specified).

Returns:

The text data grouped by the annotations in the document.

Examples:

>>> from teanga import Corpus
>>> corpus = Corpus()
>>> corpus.add_layer_meta("text")
>>> corpus.add_layer_meta("words", layer_type="span", base="text")
>>> corpus.add_layer_meta("sentences", layer_type="div", base="text")
>>> doc = corpus.add_doc("This is a sentence. This is another sentence.")
>>> doc.words = [[0,4], [5,7], [8,9], [10,18], [18,19], [20,24], [25,27], [28,35], [36,44], [44,45]]
>>> doc.sentences = [0, 20]
>>> doc.view("words")
['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', 'sentence', '.']
>>> doc.view("sentences")
['This is a sentence. ', 'This is another sentence.']
>>> doc.view("words", "sentences")
[['This', 'is', 'a', 'sentence', '.'], ['This', 'is', 'another', 'sentence', '.']]

Source code in teanga/document.py
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
    def view(self, *args, start: int = 0, end: int = None, root_layer: str = None):
        """Return a view of the document. A view is a grouping of the basic
          text data according to the annotations in the document

        Parameters:
        -----------

        args: list
            The layers to view the text data by.
        start: int
            The start index of the view.
        end: int
            The end index of the view.
        root_layer: str
            The root layer of the view (this should not normally be specified).

        Returns:
        --------

        The text data grouped by the annotations in the document.

        Examples:
        ---------

        >>> from teanga import Corpus
        >>> corpus = Corpus()
        >>> corpus.add_layer_meta("text")
        >>> corpus.add_layer_meta("words", layer_type="span", base="text")
        >>> corpus.add_layer_meta("sentences", layer_type="div", base="text")
        >>> doc = corpus.add_doc("This is a sentence. This is another sentence.")
        >>> doc.words = [[0,4], [5,7], [8,9], [10,18], [18,19], [20,24], \
[25,27], [28,35], [36,44], [44,45]]
        >>> doc.sentences = [0, 20]
        >>> doc.view("words")
        ['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', 'sentence', '.']
        >>> doc.view("sentences")
        ['This is a sentence. ', 'This is another sentence.']
        >>> doc.view("words", "sentences")
        [['This', 'is', 'a', 'sentence', '.'], ['This', 'is', 'another', \
'sentence', '.']]
          """
        if root_layer is None:
            for layer in args:
                if self._meta[layer].layer_type == "characters":
                    rl = layer
                else:
                    rl = self.layers[layer].root_layer()
                if root_layer is not None and rl != root_layer:
                    raise Exception("view was called with layers that have " +
                    "different root layers")
                root_layer = rl
        if root_layer is None:
            for layer in self.layers:
                if self._meta[layer].base is None:
                    if root_layer is not None:
                        raise Exception("view was called without specifying any" +
                        "layers or root layer but there are multiple root " +
                        "layers in the document")
                    root_layer = layer
                    break
        if end is None:
            end = len(self.layers["text"])
        if len(args) == 0:
            return self.text_for_layer(root_layer)[start:end]
        else:
            indexes = self.layers[args[-1]].indexes(root_layer)
            indexes = [(s, e) for s,e in indexes
                       if s >= start and e <= end]
            return [self.view(*args[:-1], start=s, end=e, root_layer=root_layer)
                    for s, e in indexes]

ElementLayer

Bases: StandoffLayer

A layer where each annotation is an element of the sublayer. This allows for multiple annotations of a single element. Typical examples are metadata elements such as titles.

Source code in teanga/document.py
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
class ElementLayer(StandoffLayer):
    """A layer where each annotation is an element of the sublayer. This allows
    for multiple annotations of a single element. Typical examples are
    metadata elements such as titles."""

    def __init__(self, name:str, doc: Document, spans:list):
        """Create the layer from a list of annotations.

        Parameters:
            name: str
                The name of the layer.
            doc: Document
                The document the layer belongs to.
            spans: list
                The annotations. Each entry is either a bare integer index
                or a sequence whose first item is an integer index.

        Raises:
            Exception: If the first item of an entry is not an integer.
        """
        super().__init__(name, doc)
        # A bare integer is shorthand for an annotation without data. Wrap
        # each such entry on its own, so that a list mixing bare indexes
        # with [index, data] entries is accepted as well.
        if any(isinstance(s, numbers.Integral) for s in spans):
            spans = [(s,) if isinstance(s, numbers.Integral) else s
                     for s in spans]
        self._data = spans
        for span in self._data:
            if not isinstance(span[0], numbers.Integral):
                raise Exception("Bad span data: " + repr(span))

    @property
    def data(self):
        """
        Return the data values of the layer.

        The result holds ``None`` per annotation when the layer declares no
        data, a pair of the second and third item per annotation for link
        data with link types, and the second item otherwise.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "alts": LayerDesc(layer_type="element", base="text", data="string" )},
            ... text="Tá sé seo mar shampla.")
            >>> doc["alts"] = [[1, 'á'], [4, 'é']]
            >>> doc["alts"].data
            ['á', 'é']
        """
        if self._meta.data is None:
            return [None] * len(self._data)
        elif self._meta.data == "link" and self._meta.link_types:
            return [(s[1], s[2]) for s in self._data]
        else:
            return [s[1] for s in self._data]


    def indexes(self, layer:str):
        """
        Return the indexes of the annotations of this layer.

        Parameters:
            layer: str
                The layer the indexes refer to: this layer itself, its
                base layer, or a layer further down the chain of bases.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "alts": LayerDesc(layer_type="element", base="text", data="string" )},
            ... text="Tá sé seo mar shampla.")
            >>> doc["alts"] = [[1, "á"], [4, "é"]]
            >>> doc["alts"].indexes("alts")
            [(0, 1), (1, 2)]
            >>> doc["alts"].indexes("text")
            [(1, 2), (4, 5)]
        """
        if layer == self._name:
            return list(zip(range(len(self._data)), range(1, len(self._data) + 1)))
        elif layer == self._meta.base:
            return [(_1st_idx(s), _1st_idx(s) + 1) for s in self._data]
        else:
            # Translate through the base layer's own indexes.
            subindexes = list(self._doc.layers[self._meta.base].indexes(layer))
            return [subindexes[_1st_idx(s)] for s in self._data]

    def __repr__(self):
        return "ElementLayer(" + repr(self._data) + ")"

    def transform(self, transform_func):# -> Self:
        """Return a new layer with ``transform_func`` applied to each
        annotation."""
        return ElementLayer(self._name, self._doc,
                            [transform_func(x) for x in self._data])

data property

Return the data values of the layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "alts": LayerDesc(layer_type="element", base="text", data="string" )},
... text="Tá sé seo mar shampla.")
>>> doc["alts"] = [[1, 'á'], [4, 'é']]
>>> doc["alts"].data
['á', 'é']

indexes(layer)

Return the indexes of the annotations of this layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "alts": LayerDesc(layer_type="element", base="text", data="string" )},
... text="Tá sé seo mar shampla.")
>>> doc["alts"] = [[1, "́a"], [4, "́e"]]
>>> doc["alts"].indexes("alts")
[(0, 1), (1, 2)]
>>> doc["alts"].indexes("text")
[(1, 2), (4, 5)]
Source code in teanga/document.py
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
def indexes(self, layer:str):
    """
    Return the indexes of the annotations of this layer.

    The indexes are given relative to ``layer``: this layer itself, its
    base layer, or a layer reached through the base layer.

    Examples:
        >>> doc = Document({"text": LayerDesc(layer_type="characters"),
        ... "alts": LayerDesc(layer_type="element", base="text", data="string" )},
        ... text="Tá sé seo mar shampla.")
        >>> doc["alts"] = [[1, "á"], [4, "é"]]
        >>> doc["alts"].indexes("alts")
        [(0, 1), (1, 2)]
        >>> doc["alts"].indexes("text")
        [(1, 2), (4, 5)]
    """
    if layer == self._name:
        return [(position, position + 1)
                for position in range(len(self._data))]
    if layer == self._meta.base:
        return [(_1st_idx(entry), _1st_idx(entry) + 1)
                for entry in self._data]
    base_layer = self._doc.layers[self._meta.base]
    base_indexes = list(base_layer.indexes(layer))
    return [base_indexes[_1st_idx(entry)] for entry in self._data]

Layer

Bases: ABC

Base class for all layers in a document.

This class defines the basic interface for all layers in a document. It is not meant to be instantiated directly, but rather to be subclassed by specific layer types such as CharacterLayer, SeqLayer, SpanLayer, etc.

Source code in teanga/document.py
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
class Layer(ABC):
    """Base class for all layers in a document.

    This class defines the basic interface for all layers in a document.
    It is not meant to be instantiated directly, but rather to be subclassed
    by specific layer types such as CharacterLayer, SeqLayer, SpanLayer, etc.
    """
    # NOTE: ``text_data`` and ``indexes_data`` below read ``self.text`` and
    # ``self.data`` without calling them, so subclasses are expected to
    # provide ``data`` and ``text`` as properties.
    @abstractmethod
    def data(self) -> list[Union[str,int,Tuple[int,str]]]:
        """Return the data values of the layer."""
        pass

    @abstractmethod
    def raw(self) -> list:
        """Return the raw data values of the layer."""
        pass

    @abstractmethod
    def text(self) -> list[str]:
        """Return the underlying text grouped by the annotations of this layer."""
        pass

    def text_data(self) -> Iterator[tuple[str,Union[str,int,Tuple[int,str]]]]:
        """Return an iterator over pairs of the underlying text grouped by
        the annotations of this layer and the data values of the layer."""
        return zip(self.text, self.data)

    @abstractmethod
    def indexes(self, layer:str) -> list[Tuple[int,int]]:
        """Return the indexes of the annotations of this layer."""
        pass

    def indexes_data(self, layer:str) -> Iterator[
            tuple[Tuple[int,int],Union[str,int,Tuple[int,str]]]]:
        """Return an iterator over pairs of the indexes of the annotations
        of this layer and the data values of the layer.

        Examples:
            >>> from .layer_desc import LayerDesc
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "words": LayerDesc(layer_type="seq", base="text")})
            >>> doc["text"] = "This"
            >>> list(doc["text"].indexes_data("text"))
            [((0, 1), None), ((1, 2), None), ((2, 3), None), ((3, 4), None)]
            >>> doc["words"] = ["A", "B", "C", "D"]
            >>> list(doc["words"].indexes_data("words"))
            [((0, 1), 'A'), ((1, 2), 'B'), ((2, 3), 'C'), ((3, 4), 'D')]
        """
        return zip(self.indexes(layer), self.data)

    @abstractmethod
    def transform(self, transform_func):# -> Self:
        """Transform the layer using a transformation function."""
        pass

data() abstractmethod

Return the data values of the layer.

Source code in teanga/document.py
491
492
493
494
@abstractmethod
def data(self) -> list[Union[str,int,Tuple[int,str]]]:
    """Return the data values of the layer.

    Abstract: the body is empty and subclasses supply the implementation.
    """

indexes(layer) abstractmethod

Return the indexes of the annotations of this layer.

Source code in teanga/document.py
512
513
514
515
@abstractmethod
def indexes(self, layer:str) -> list[Tuple[int,int]]:
    """Return the indexes of the annotations of this layer.

    Abstract: the body is empty and subclasses supply the implementation.
    """

indexes_data(layer)

Return an iterator over pairs of the indexes of the annotations of this layer and the data values of the layer.

Examples:

>>> from .layer_desc import LayerDesc
>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "words": LayerDesc(layer_type="seq", base="text")})
>>> doc["text"] = "This"
>>> list(doc["text"].indexes_data("text"))
[((0, 1), None), ((1, 2), None), ((2, 3), None), ((3, 4), None)]
>>> doc["words"] = ["A", "B", "C", "D"]
>>> list(doc["words"].indexes_data("words"))
[((0, 1), 'A'), ((1, 2), 'B'), ((2, 3), 'C'), ((3, 4), 'D')]
Source code in teanga/document.py
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
def indexes_data(self, layer:str) -> Iterator[
        tuple[Tuple[int,int],Union[str,int,Tuple[int,str]]]]:
    """Return an iterator over pairs of the indexes of the annotations of
    this layer and the data values of the layer.

    Parameters:
        layer: str
            The layer the indexes refer to (passed on to ``indexes``).

    Examples:
        >>> from .layer_desc import LayerDesc
        >>> doc = Document({"text": LayerDesc(layer_type="characters"),
        ... "words": LayerDesc(layer_type="seq", base="text")})
        >>> doc["text"] = "This"
        >>> list(doc["text"].indexes_data("text"))
        [((0, 1), None), ((1, 2), None), ((2, 3), None), ((3, 4), None)]
        >>> doc["words"] = ["A", "B", "C", "D"]
        >>> list(doc["words"].indexes_data("words"))
        [((0, 1), 'A'), ((1, 2), 'B'), ((2, 3), 'C'), ((3, 4), 'D')]
    """
    return zip(self.indexes(layer), self.data)

raw() abstractmethod

Return the raw data values of the layer.

Source code in teanga/document.py
496
497
498
499
@abstractmethod
def raw(self) -> list:
    """Return the raw data values of the layer.

    Abstract: the body is empty and subclasses supply the implementation.
    """

text() abstractmethod

Return the underlying text grouped by the annotations of this layer.

Source code in teanga/document.py
501
502
503
504
@abstractmethod
def text(self) -> list[str]:
    """Return the underlying text grouped by the annotations of this layer.

    Abstract: the body is empty and subclasses supply the implementation.
    """

text_data()

Return a list of pairs of the underlying text grouped by the annotations of this layer and the data values of the layer.

Source code in teanga/document.py
506
507
508
509
510
def text_data(self) -> Iterator[tuple[str,Union[str,int,Tuple[int,str]]]]:
    """Return an iterator over pairs of the underlying text grouped by the
    annotations of this layer and the data values of the layer."""
    return zip(self.text, self.data)

transform(transform_func) abstractmethod

Transform the layer using a transformation function.

Source code in teanga/document.py
535
536
537
538
@abstractmethod
def transform(self, transform_func):# -> Self:
    """Transform the layer using a transformation function.

    Abstract: the body is empty and subclasses supply the implementation.
    """

SeqLayer

Bases: DataLayer

A layer that is in one-to-one correspondence with its sublayer. Typical examples are POS tags, lemmas, etc.

Source code in teanga/document.py
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
class SeqLayer(DataLayer):
    """A layer that is in one-to-one correspondence with its sublayer.
    Typical examples are POS tags, lemmas, etc."""
    def __init__(self, name:str, doc:Document, seq:list):
        """
        Create a sequence layer.

        Parameters:
            name: str
                The name of this layer; passed on to the superclass.
            doc: Document
                The document the layer belongs to; passed on to the
                superclass.
            seq: list
                The annotation values. The list is stored as given
                (it is not copied).
        """
        super().__init__(name, doc)
        self._data = seq

    @property
    def data(self):
        """
        Return the data values of the layer.

        Examples:
            >>> d = Document({"text": LayerDesc(layer_type="characters"),
            ... "is_num": LayerDesc(layer_type="seq", base="text")})
            >>> d["text"] = "A0B"
            >>> d["is_num"] = [0, 1, 0]
            >>> d["is_num"].data
            [0, 1, 0]
        """
        return self._data

    @property
    def raw(self):
        """Return the stored values; for this layer type it is the same
        list that `data` returns."""
        return self._data

    @property
    def text(self):
        """
        Return the underlying text grouped by the annotations of this layer.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "is_num": LayerDesc(layer_type="seq", base="text")},
            ... text="A0B")
            >>> doc["is_num"] = [0,1,0]
            >>> doc["is_num"].text
            ['A', '0', 'B']
        """
        return list(self._doc.text_for_layer(self._name))

    def indexes(self, layer:str):
        """
        Return the indexes of the annotations of this layer.

        When `layer` is this layer's own name, each annotation occupies one
        position `(i, i+1)`. For any other layer name the request is
        delegated to the base layer of this layer.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "is_num": LayerDesc(layer_type="seq", base="text")},
            ... text="A0B")
            >>> doc["is_num"] = [0,1,0]
            >>> doc["is_num"].indexes("text")
            [(0, 1), (1, 2), (2, 3)]
        """
        if layer == self._name:
            return [(i, i+1) for i in range(len(self._data))]
        else:
            return self._doc.layers[self._meta.base].indexes(layer)

    def __repr__(self):
        return "SeqLayer(" + repr(self._data) + ")"

    def transform(self, transform_func):# -> Self:
        """Return a new SeqLayer with `transform_func` applied to every
        value of this layer. This layer is left unchanged."""
        # The values live in ``self._data``; no ``seq`` attribute is ever
        # assigned, so reading ``self.seq`` here raised AttributeError.
        return SeqLayer(self._name, self._doc,
                        [transform_func(x) for x in self._data])

data property

Return the data values of the layer.

Examples:

>>> d = Document({"text": LayerDesc(layer_type="characters"),
... "is_num": LayerDesc(layer_type="seq", base="text")})
>>> d["text"] = "A0B"
>>> d["is_num"] = [0, 1, 0]
>>> d["is_num"].data
[0, 1, 0]

text property

Return the underlying text grouped by the annotations of this layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "is_num": LayerDesc(layer_type="seq", base="text")},
... text="A0B")
>>> doc["is_num"] = [0,1,0]
>>> doc["is_num"].text
['A', '0', 'B']

indexes(layer)

Return the indexes of the annotations of this layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "is_num": LayerDesc(layer_type="seq", base="text")},
... text="A0B")
>>> doc["is_num"] = [0,1,0]
>>> doc["is_num"].indexes("text")
[(0, 1), (1, 2), (2, 3)]
Source code in teanga/document.py
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
def indexes(self, layer:str):
    """
    Return the indexes of the annotations of this layer.

    When `layer` is this layer's own name, each annotation occupies one
    position `(i, i+1)`. For any other layer name the request is
    delegated to the base layer of this layer.

    Examples:
        >>> doc = Document({"text": LayerDesc(layer_type="characters"),
        ... "is_num": LayerDesc(layer_type="seq", base="text")},
        ... text="A0B")
        >>> doc["is_num"] = [0,1,0]
        >>> doc["is_num"].indexes("text")
        [(0, 1), (1, 2), (2, 3)]
    """
    if layer != self._name:
        # Not our own coordinate system: let the base layer resolve it.
        base_layer = self._doc.layers[self._meta.base]
        return base_layer.indexes(layer)
    return [(start, start + 1) for start in range(len(self._data))]

SpanLayer

Bases: StandoffLayer

A layer that defines spans of the sublayer which are annotated. Typical examples are tokens, named entities, chunks, etc.

Source code in teanga/document.py
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
class SpanLayer(StandoffLayer):
    """A layer that defines spans of the sublayer which are annotated.
    Typical examples are tokens, named entities, chunks, etc."""
    def __init__(self, name:str, doc: Document, spans:list):
        """
        Store the spans and check that every span begins with two integers.

        Raises:
            Exception: if the first or second element of a span is not an
                integer.
        """
        super().__init__(name, doc)
        self._data = spans
        for span in self._data:
            # Start is checked before end, so a bad start is reported
            # without ever looking at the end.
            if not (isinstance(span[0], numbers.Integral)
                    and isinstance(span[1], numbers.Integral)):
                raise Exception("Bad span data: " + repr(span))

    @property
    def data(self):
        """
        Return the data values of the layer.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "words": LayerDesc(layer_type="span", base="text", data="string")},
            ... text="This is an example.")
            >>> doc["words"] = [[0,4,"A"], [5,7,"B"], [8,10,"C"],
            ... [11,18,"D"]]
            >>> doc["words"].data
            ['A', 'B', 'C', 'D']
        """
        data_kind = self._meta.data
        if data_kind is None:
            # No data declared: one None per span.
            return [None] * len(self._data)
        if data_kind == "link" and self._meta.link_types:
            # Links with link types carry two data elements per span.
            return [(span[2], span[3]) for span in self._data]
        return [span[2] for span in self._data]


    def indexes(self, layer:str):
        """
        Return the indexes of the annotations of this layer.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "words": LayerDesc(layer_type="span", base="text", data="string")},
            ... text="This is an example.")
            >>> doc["words"] = [[0,4,"A"], [5,7,"B"], [8,10,"C"],
            ... [11,18,"D"]]
            >>> doc["words"].indexes("words")
            [(0, 1), (1, 2), (2, 3), (3, 4)]
            >>> doc["words"].indexes("text")
            [(0, 4), (5, 7), (8, 10), (11, 18)]
        """
        if layer == self._name:
            return [(pos, pos + 1) for pos in range(len(self._data))]
        if layer == self._meta.base:
            return [(span[0], span[1]) for span in self._data]
        # Deeper layer: map our start/end through the base layer's indexes.
        base_layer = self._doc.layers[self._meta.base]
        subindexes = list(base_layer.indexes(layer))
        return [(subindexes[span[0]][0], subindexes[span[1]-1][1])
                for span in self._data]

    def __repr__(self):
        return "SpanLayer(" + repr(self._data) + ")"

    def transform(self, transform_func):# -> Self:
        """Return a new SpanLayer built by applying `transform_func` to
        every stored span."""
        transformed = [transform_func(span) for span in self._data]
        return SpanLayer(self._name, self._doc, transformed)

data property

Return the data values of the layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "words": LayerDesc(layer_type="span", base="text", data="string")},
... text="This is an example.")
>>> doc["words"] = [[0,4,"A"], [5,7,"B"], [8,10,"C"],
... [11,18,"D"]]
>>> doc["words"].data
['A', 'B', 'C', 'D']

indexes(layer)

Return the indexes of the annotations of this layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "words": LayerDesc(layer_type="span", base="text", data="string")},
... text="This is an example.")
>>> doc["words"] = [[0,4,"A"], [5,7,"B"], [8,10,"C"],
... [11,18,"D"]]
>>> doc["words"].indexes("words")
[(0, 1), (1, 2), (2, 3), (3, 4)]
>>> doc["words"].indexes("text")
[(0, 4), (5, 7), (8, 10), (11, 18)]
Source code in teanga/document.py
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
def indexes(self, layer:str):
    """
    Return the indexes of the annotations of this layer.

    Three cases are handled: the layer's own name (one position per
    span), the base layer (the stored start/end of each span), and any
    other layer (start/end mapped through the base layer's indexes).

    Examples:
        >>> doc = Document({"text": LayerDesc(layer_type="characters"),
        ... "words": LayerDesc(layer_type="span", base="text", data="string")},
        ... text="This is an example.")
        >>> doc["words"] = [[0,4,"A"], [5,7,"B"], [8,10,"C"],
        ... [11,18,"D"]]
        >>> doc["words"].indexes("words")
        [(0, 1), (1, 2), (2, 3), (3, 4)]
        >>> doc["words"].indexes("text")
        [(0, 4), (5, 7), (8, 10), (11, 18)]
    """
    if layer == self._name:
        return [(pos, pos + 1) for pos in range(len(self._data))]
    if layer == self._meta.base:
        return [(span[0], span[1]) for span in self._data]
    # Deeper layer: map our start/end through the base layer's indexes.
    base_layer = self._doc.layers[self._meta.base]
    subindexes = list(base_layer.indexes(layer))
    return [(subindexes[span[0]][0], subindexes[span[1]-1][1])
            for span in self._data]

StandoffLayer

Bases: DataLayer

Common superclass of span, div and element layers. Cannot be used directly.

Source code in teanga/document.py
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
class StandoffLayer(DataLayer):
    """Common superclass of span, div and element layers. Cannot be used
    directly"""
    def __len__(self):
        """Return the number of stored annotations."""
        return len(self._data)

    @property
    def raw(self):
        """Return the stored annotation list itself (not a copy)."""
        return self._data

    @property
    def text(self):
        """
        Return the underlying text grouped by the annotations of this layer.

        Examples:
            >>> doc = Document({"text": LayerDesc(layer_type="characters"),
            ... "words": LayerDesc(layer_type="span", base="text", data="string")},
            ... text="This is an example.")
            >>> doc["words"] = [[0,4,"A"], [5,7,"B"], [8,10,"C"],
            ... [11,18,"D"]]
            >>> doc["words"].text
            ['This', 'is', 'an', 'example']
        """
        # Materialise whatever the document yields for this layer.
        segments = self._doc.text_for_layer(self._name)
        return list(segments)

text property

Return the underlying text grouped by the annotations of this layer.

Examples:

>>> doc = Document({"text": LayerDesc(layer_type="characters"),
... "words": LayerDesc(layer_type="span", base="text", data="string")},
... text="This is an example.")
>>> doc["words"] = [[0,4,"A"], [5,7,"B"], [8,10,"C"],
... [11,18,"D"]]
>>> doc["words"].text
['This', 'is', 'an', 'example']

clip_string(s)

Reduce a string to a maximum of 100 characters.

Source code in teanga/document.py
400
401
402
403
404
405
def clip_string(s):
    """Reduce a string to a maximum of 100 characters.

    Strings of up to 100 characters are returned unchanged. Longer strings
    are shortened to exactly 100 characters: the first 89 characters, the
    marker "... ", and the last 7 characters.

    Parameters:
        s: str
            The string to clip.

    Returns:
        The original string, or its clipped form.
    """
    max_length = 100
    marker = "... "
    tail_length = 7
    if len(s) <= max_length:
        return s
    # Derive the head length from the limit so that the pieces add up to
    # exactly max_length. A fixed head of 90 gave 90 + 4 + 7 = 101
    # characters, one more than the documented maximum.
    head_length = max_length - len(marker) - tail_length
    return s[:head_length] + marker + s[-tail_length:]

validate_value(value, index_length)

Validate a single value in a layer and normalise it if necessary.

Values must be 0-2 integer indexes followed by a string, an integer or an integer and then a string. If this results in a list of 1 element that list should be dropped.

Source code in teanga/document.py
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
def validate_value(value, index_length):
    """Validate a single value in a layer and normalise it if necessary.

    Values must be 0-2 integer indexes followed by a string, an integer or an
    integer and then a string. If this results in a list of 1 element that list
    should be dropped.

    Parameters:
        value:
            The value to check: a bare string or integer, or a list/tuple
            of `index_length` integer indexes optionally followed by the
            data element(s).
        index_length: int
            The number of leading integer indexes the value must have.

    Returns:
        The normalised value. Tuples are returned as lists, a one-element
        list is replaced by its only element, and anything else is
        returned as given.

    Raises:
        Exception: with the message "Bad value: <repr>" if the value does
            not have one of the accepted shapes.
    """
    if isinstance(value, tuple):
        value = list(value)
    if not isinstance(value, list):
        # A bare value can stand in for at most one index (or, when there
        # are no indexes, for the data itself).
        if index_length >= 2:
            raise Exception("Bad value: " + repr(value))
        if index_length == 1 and not isinstance(value, numbers.Integral):
            raise Exception("Bad value: " + repr(value))
        if (index_length == 0
                and not isinstance(value, (str, numbers.Integral))):
            raise Exception("Bad value: " + repr(value))
        return value

    # A list too short to hold all of its indexes is malformed. Reject it
    # here so the index check below cannot fail with a bare IndexError.
    if len(value) < index_length:
        raise Exception("Bad value: " + repr(value))
    for i in range(index_length):
        if not isinstance(value[i], numbers.Integral):
            raise Exception("Bad value: " + repr(value))

    if len(value) == 1:
        # A single-element list is unwrapped to its only element.
        if not isinstance(value[0], (str, numbers.Integral)):
            raise Exception("Bad value: " + repr(value))
        return value[0]
    elif len(value) == index_length:
        # Indexes only, no data.
        return value
    elif len(value) == index_length + 1:
        # Indexes followed by one string or integer.
        if not isinstance(value[index_length], (str, numbers.Integral)):
            raise Exception("Bad value: " + repr(value))
        return value
    elif len(value) == index_length + 2:
        # Indexes followed by an integer and then a string.
        if (not isinstance(value[index_length], numbers.Integral)
                or not isinstance(value[index_length + 1], str)):
            raise Exception("Bad value: " + repr(value))
        return value
    else:
        raise Exception("Bad value: " + repr(value))