Skip to content

Teanga Transforms Module

TransformedCorpus

Bases: ImmutableCorpus

A corpus that lazily applies a transformation to its documents.

Source code in teanga/transforms.py
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
class TransformedCorpus(ImmutableCorpus):
    """A corpus that lazily applies a transformation to its documents.

    Documents obtained through ``docs`` or ``doc_by_id`` are copies of the
    wrapped corpus's documents with the per-layer transformation functions
    applied. All other operations are forwarded to the wrapped corpus.
    """

    def __init__(self, corpus: 'ImmutableCorpus',
                 transform: dict[str, Callable[[str], str]]):
        """Create a new TransformedCorpus.

        Args:
            corpus: The corpus to transform.
            transform: A dictionary mapping layer names to functions
                that transform the layer.
        """
        self.corpus = corpus
        self._transform = transform

    def add_meta_from_service(self, service: 'Service'):
        """Add metadata from a service to the corpus.

        The call is forwarded to the wrapped corpus.
        See Corpus.add_meta_from_service for more information.
        """
        self.corpus.add_meta_from_service(service)

    def add_layer_meta(self, name:str=None,
                  layer_type:str="characters", base:str=None, 
                  data=None, link_types:list[str]=None,
                  target:str=None, default=None):
        """Add a layer to the corpus.

        The arguments are forwarded positionally to the wrapped corpus.
        See Corpus.add_layer_meta for more information.
        """
        self.corpus.add_layer_meta(name, layer_type, base,
                                   data, link_types, target, default)

    def add_doc(self, *args, **kwargs) -> 'Document':
        """Add a document to the corpus.

        The arguments are forwarded to the wrapped corpus and its result
        is returned as is (the transformation is not applied to it).
        See Corpus.add_doc for more information.
        """
        return self.corpus.add_doc(*args, **kwargs)

    def doc_ids(self) -> list[str]:
        """Return a list of document ids in the corpus.

        See Corpus.doc_ids for more information.
        """
        return self.corpus.doc_ids()

    @property
    def docs(self) -> Iterator['Document']:
        """Return an iterator over the documents in the corpus.

        Each document of the wrapped corpus is passed through
        ``transform_doc`` as it is yielded.
        See Corpus.docs for more information.
        """
        for doc in self.corpus.docs:
            yield self.transform_doc(doc)

    def transform_doc(self, doc: 'Document') -> 'Document':
        """Transform a document using the transformation functions.

        Args:
            doc: The document to transform.

        Returns:
            A new document with the transformed layers.
        """
        # Work on a copy so the document handed in is left untouched.
        new_doc = doc.copy()
        for layer_name, transform in self._transform.items():
            new_doc[layer_name] = doc.layers[layer_name].transform(transform)
        return new_doc

    def doc_by_id(self, doc_id: str) -> 'Document':
        """Return a document by its id, with the transformation applied.

        See Corpus.doc_by_id for more information.
        """
        return self.transform_doc(self.corpus.doc_by_id(doc_id))

    @property
    def meta(self) -> dict:
        """Return the metadata of the corpus.

        See Corpus.meta for more information.
        """
        return self.corpus.meta

    @meta.setter
    def meta(self, value: dict):
        """Set the metadata of the corpus.

        See Corpus.meta for more information.
        """
        self.corpus.meta = value

    def apply(self, service: 'Service'):
        """Apply a service to the corpus.

        The service is applied to the wrapped corpus.
        See Corpus.apply for more information.
        """
        self.corpus.apply(service)

    @staticmethod
    def _chain(first, second):
        """Return a function that applies ``first`` and then ``second``."""
        # Both functions are bound here, at creation time, so the result
        # does not depend on loop variables or later dictionary changes.
        return lambda x: second(first(x))

    def _with_text_transform(self, operation) -> 'TransformedCorpus':
        """Return a new corpus with ``operation`` added on every character layer.

        Character layers are those whose entry in ``meta`` has
        ``layer_type == "characters"``. Transforms already registered on
        other layers are carried over unchanged.
        """
        text_layers = [layer for layer in self.meta
                       if self.meta[layer].layer_type == "characters"]
        new_transform = self._transform.copy()
        for layer in text_layers:
            if layer in self._transform:
                new_transform[layer] = self._chain(self._transform[layer],
                                                   operation)
            else:
                new_transform[layer] = operation
        # The new functions already include the existing transforms, so wrap
        # the underlying corpus; wrapping self would run them a second time.
        return TransformedCorpus(self.corpus, new_transform)

    def lower(self) -> 'TransformedCorpus':
        """Lowercase all the text in the corpus.

        Returns:
            A new TransformedCorpus; this corpus is not modified.
        """
        return self._with_text_transform(lambda x: x.lower())

    def upper(self) -> 'TransformedCorpus':
        """Uppercase all the text in the corpus.

        Returns:
            A new TransformedCorpus; this corpus is not modified.
        """
        return self._with_text_transform(lambda x: x.upper())

    def transform(self, layer: str,
                  transform: Callable[[str], str]) -> 'TransformedCorpus':
        """Transform a layer in the corpus.

        If the layer already has a transformation, the new function is
        applied to the result of the existing one.

        Args:
            layer: The name of the layer to transform.
            transform: The transformation function.

        Returns:
            A new TransformedCorpus; this corpus is not modified.

        Examples:
            >>> import teanga
            >>> corpus = teanga.text_corpus()
            >>> doc = corpus.add_doc("This is a document.")
            >>> corpus = corpus.upper().transform("text", lambda x: x[:10])
            >>> list(corpus.docs)
            [Document('Kjco', {'text': 'THIS IS A '})]
        """
        new_transform = self._transform.copy()
        if layer in self._transform:
            new_transform[layer] = self._chain(self._transform[layer],
                                               transform)
        else:
            new_transform[layer] = transform
        # The composed function already contains the existing transform, so
        # wrap the underlying corpus to avoid applying it twice.
        return TransformedCorpus(self.corpus, new_transform)

docs property

Return an iterator over the documents in the corpus. See Corpus.docs for more information.

meta property writable

Return the metadata of the corpus. See Corpus.meta for more information.

__init__(corpus, transform)

Create a new TransformedCorpus.

Parameters:

Name Type Description Default
corpus ImmutableCorpus

The corpus to transform.

required
transform dict[str, Callable[[str], str]]

A dictionary mapping layer names to functions that transform the layer.

required
Source code in teanga/transforms.py
 8
 9
10
11
12
13
14
15
16
17
def __init__(self, corpus : 'ImmutableCorpus', transform : dict[str, Callable[[str], str]]):
    """Wrap ``corpus`` and remember the per-layer transformation functions.

    Args:
        corpus: The corpus whose documents are to be transformed.
        transform: Mapping from a layer name to the function that
            rewrites that layer.
    """
    self.corpus, self._transform = corpus, transform

add_doc(*args, **kwargs)

Add a document to the corpus. See Corpus.add_doc for more information.

Source code in teanga/transforms.py
35
36
37
38
39
def add_doc(self, *args, **kwargs) -> 'Document':
    """Forward ``add_doc`` to the wrapped corpus and return its result.

    Positional and keyword arguments are passed through unchanged.
    See Corpus.add_doc for more information.
    """
    wrapped = self.corpus
    return wrapped.add_doc(*args, **kwargs)

add_layer_meta(name=None, layer_type='characters', base=None, data=None, link_types=None, target=None, default=None)

Add a layer to the corpus. See Corpus.add_layer_meta for more information.

Source code in teanga/transforms.py
25
26
27
28
29
30
31
32
33
def add_layer_meta(self, name:str=None,
              layer_type:str="characters", base:str=None, 
              data=None, link_types:list[str]=None,
              target:str=None, default=None):
    """Declare a layer on the wrapped corpus.

    Every argument is handed on positionally, in the order of this
    signature. Nothing is returned.
    See Corpus.add_layer_meta for more information.
    """
    layer_spec = (name, layer_type, base, data, link_types, target, default)
    self.corpus.add_layer_meta(*layer_spec)

add_meta_from_service(service)

Add metadata from a service to the corpus. See Corpus.add_meta_from_service for more information.

Source code in teanga/transforms.py
19
20
21
22
23
def add_meta_from_service(self, service: 'Service'):
    """Forward ``add_meta_from_service`` to the wrapped corpus.

    Nothing is returned.
    See Corpus.add_meta_from_service for more information.
    """
    wrapped = self.corpus
    wrapped.add_meta_from_service(service)

apply(service)

Apply a service to the corpus. See Corpus.apply for more information.

Source code in teanga/transforms.py
89
90
91
92
93
def apply(self, service: 'Service'):
    """Forward ``apply`` to the wrapped corpus.

    Nothing is returned.
    See Corpus.apply for more information.
    """
    wrapped = self.corpus
    wrapped.apply(service)

doc_by_id(doc_id)

Return a document by its id. See Corpus.doc_by_id for more information.

Source code in teanga/transforms.py
69
70
71
72
73
def doc_by_id(self, doc_id: str) -> 'Document':
    """Look up a document in the wrapped corpus and return it transformed.

    The document found by the wrapped corpus is passed through
    ``transform_doc`` before being returned.
    See Corpus.doc_by_id for more information.
    """
    original = self.corpus.doc_by_id(doc_id)
    return self.transform_doc(original)

doc_ids()

Return a list of document ids in the corpus. See Corpus.doc_ids for more information.

Source code in teanga/transforms.py
41
42
43
44
45
def doc_ids(self) -> list[str]:
    """Return the document ids reported by the wrapped corpus.

    See Corpus.doc_ids for more information.
    """
    wrapped = self.corpus
    return wrapped.doc_ids()

lower()

Lowercase all the text in the corpus.

Source code in teanga/transforms.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
def lower(self) -> 'TransformedCorpus':
    """Lowercase all the text in the corpus.

    Every layer whose entry in ``meta`` has ``layer_type == "characters"``
    gets a lowercasing step, applied after any transformation the layer
    already has. Transforms on other layers are carried over unchanged.

    Returns:
        A new TransformedCorpus; this corpus is not modified.
    """
    def lowered_after(previous):
        # Bind the existing function now. A lambda written inline in the
        # loop would look up `layer` only when called and would therefore
        # use the last layer's function for every layer.
        return lambda x: previous(x).lower()

    text_layers = [layer for layer in self.meta
                   if self.meta[layer].layer_type == "characters"]
    new_transform = self._transform.copy()
    for layer in text_layers:
        if layer in self._transform:
            new_transform[layer] = lowered_after(self._transform[layer])
        else:
            new_transform[layer] = lambda x: x.lower()
    # The new functions already include the existing transforms, so wrap
    # the underlying corpus; wrapping self would run them a second time.
    return TransformedCorpus(self.corpus, new_transform)

transform(layer, transform)

Transform a layer in the corpus.

Parameters:

Name Type Description Default
layer str

The name of the layer to transform.

required
transform Callable[[str], str]

The transformation function.

required

Examples:

>>> import teanga
>>> corpus = teanga.text_corpus()
>>> doc = corpus.add_doc("This is a document.")
>>> corpus = corpus.upper().transform("text", lambda x: x[:10])
>>> list(corpus.docs)
[Document('Kjco', {'text': 'THIS IS A '})]
Source code in teanga/transforms.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def transform(self, layer: str,
              transform: Callable[[str], str]) -> 'TransformedCorpus':
    """Transform a layer in the corpus.

    If the layer already has a transformation, the new function is
    applied to the result of the existing one.

    Args:
        layer: The name of the layer to transform.
        transform: The transformation function.

    Returns:
        A new TransformedCorpus; this corpus is not modified.

    Examples:
        >>> import teanga
        >>> corpus = teanga.text_corpus()
        >>> doc = corpus.add_doc("This is a document.")
        >>> corpus = corpus.upper().transform("text", lambda x: x[:10])
        >>> list(corpus.docs)
        [Document('Kjco', {'text': 'THIS IS A '})]
    """
    new_transform = self._transform.copy()
    if layer in self._transform:
        # Capture the existing function now rather than looking it up in
        # self._transform each time the composed function is called.
        previous = self._transform[layer]
        new_transform[layer] = lambda x: transform(previous(x))
    else:
        new_transform[layer] = transform
    # The composed function already contains the existing transform, so
    # wrap the underlying corpus to avoid applying it twice.
    return TransformedCorpus(self.corpus, new_transform)

transform_doc(doc)

Transform a document using the transformation functions.

Parameters:

Name Type Description Default
doc Document

The document to transform.

required

Returns:

Type Description
Document

A new document with the transformed layers.

Source code in teanga/transforms.py
55
56
57
58
59
60
61
62
63
64
65
66
67
def transform_doc(self, doc: 'Document') -> 'Document':
    """Apply every registered layer transformation to a copy of ``doc``.

    Args:
        doc: The document to transform.

    Returns:
        A copy of ``doc`` in which each layer that has a registered
        function is replaced by that layer's transformed version.
    """
    transformed = doc.copy()
    for name in self._transform:
        func = self._transform[name]
        # Read from the original document and write to the copy.
        transformed[name] = doc.layers[name].transform(func)
    return transformed

upper()

Uppercase all the text in the corpus.

Source code in teanga/transforms.py
107
108
109
110
111
112
113
114
115
116
117
def upper(self) -> 'TransformedCorpus':
    """Uppercase all the text in the corpus.

    Every layer whose entry in ``meta`` has ``layer_type == "characters"``
    gets an uppercasing step, applied after any transformation the layer
    already has. Transforms on other layers are carried over unchanged.

    Returns:
        A new TransformedCorpus; this corpus is not modified.
    """
    def uppered_after(previous):
        # Bind the existing function now. A lambda written inline in the
        # loop would look up `layer` only when called and would therefore
        # use the last layer's function for every layer.
        return lambda x: previous(x).upper()

    text_layers = [layer for layer in self.meta
                   if self.meta[layer].layer_type == "characters"]
    new_transform = self._transform.copy()
    for layer in text_layers:
        if layer in self._transform:
            new_transform[layer] = uppered_after(self._transform[layer])
        else:
            new_transform[layer] = lambda x: x.upper()
    # The new functions already include the existing transforms, so wrap
    # the underlying corpus; wrapping self would run them a second time.
    return TransformedCorpus(self.corpus, new_transform)