From 679bdafd1749d19272a090e6e73b8dba033edb55 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Sat, 14 Aug 2021 23:48:38 -0700 Subject: [PATCH] Prepare for release 0.5.1 --- CHANGELOG.md | 5 +- docs/index.html | 141 +++++++++++++++++++++++++++++++++++---- rtg/__init__.py | 2 +- scripts/rtg-translate.py | 102 ++++++++++++++++++++++++++++ 4 files changed, 234 insertions(+), 16 deletions(-) create mode 100644 scripts/rtg-translate.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 45e3f1e..2c118ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ -# v0.5.1 : WIP +# v0.5.1 : 20210814 - Add `rtg-params` command that shows trainable parameters in model (layer wise as well as total) - `rtg.serve` supports flexible transformations on source (pre processing) and target (post processing) -- Travis build configured to auto run tests +- Travis build configured to auto run tests +- sequence classification is now supported via `tfmcls` model # v0.5.0 : 20210329 diff --git a/docs/index.html b/docs/index.html index ec9802e..d5e1a31 100644 --- a/docs/index.html +++ b/docs/index.html @@ -4,7 +4,7 @@ - + Reader-Translator-Generator (RTG) @@ -95,9 +95,6 @@ abbr,acronym{text-transform:uppercase;font-size:90%;color:rgba(0,0,0,.8);border-bottom:1px dotted #ddd;cursor:help} abbr{text-transform:none} blockquote{margin:0 0 1.25em;padding:.5625em 1.25em 0 1.1875em;border-left:1px solid #ddd} -blockquote cite{display:block;font-size:.9375em;color:rgba(0,0,0,.6)} -blockquote cite::before{content:"\2014 \0020"} -blockquote cite a,blockquote cite a:visited{color:rgba(0,0,0,.6)} blockquote,blockquote p{line-height:1.6;color:rgba(0,0,0,.85)} @media screen and (min-width:768px){h1,h2,h3,#toctitle,.sidebarblock>.content>.title,h4,h5,h6{line-height:1.2} h1{font-size:2.75em} @@ -262,7 +259,7 @@ .quoteblock.excerpt>blockquote,.quoteblock .quoteblock{padding:0 0 .25em 1em;border-left:.25em solid #dddddf} .quoteblock.excerpt,.quoteblock .quoteblock{margin-left:0} .quoteblock.excerpt 
blockquote,.quoteblock.excerpt p,.quoteblock .quoteblock blockquote,.quoteblock .quoteblock p{color:inherit;font-size:1.0625rem} -.quoteblock.excerpt .attribution,.quoteblock .quoteblock .attribution{color:inherit;text-align:left;margin-right:0} +.quoteblock.excerpt .attribution,.quoteblock .quoteblock .attribution{color:inherit;font-size:.85rem;text-align:left;margin-right:0} p.tableblock:last-child{margin-bottom:0} td.tableblock>.content{margin-bottom:1.25em;word-wrap:anywhere} td.tableblock>.content>:last-child{margin-bottom:-1.25em} @@ -525,11 +522,17 @@

Reader-Translator-Generator (RTG)

  • 8. Distributed Data Parallel (DDP)
  • 9. FP16, Mixed Precision Training
  • 10. Scaling to Big Datasets Using PySpark
  • -
  • 11. RTG Serve
  • -
  • 12. Development Environment: +
  • 11. RTG Serve +
  • +
  • 12. Pre-process and post-process
  • +
  • 13. Development Environment: +
  • @@ -2059,6 +2062,19 @@

    11. RTG Serve

    RTG model can be served using Flask Server.

    +
    +

    11.1. Flask Installation

    +
    +
    +
    $ pip install rtg[serve]
    +
    +
    +
    +

    Flask has its own set of dependencies unrelated to the core functionality; hence, they are not installed when installing rtg.

    +
    +
    +
    +

    11.2. Running

    $ python -m rtg.serve -h  # rtg-serve
    @@ -2162,11 +2178,110 @@ 

    11. RTG Serve

    + +
    +

    12. Pre-process and post-process

    +
    +
    +

    The input/source text given to the API must be pre-processed with the same settings as the preprocessing used during the training phase. So, we offer configurations to match the preprocessing:

    +
    +
    +
      +
    • +

      src_pre_proc: List of transformations to be used on source text before giving to model (e.g. tokenizer, lowercase)

      +
    • +
    • +

      tgt_pre_proc: List of transformations to be used on target text before giving to model (e.g. tokenizer, lowercase)

      +
    • +
    • +

      tgt_post_proc: List of transformations to be used on target text produced by model (e.g. detokenizer, removal of unk)

      +
    • +
    +
    +
    +

    The following transformations are built into RTG, so you may simply use their name:

    +
    +
    +
    +
    transformers  = {
    +    'no_op': lambda x: x,
    +    'space_tok': lambda x: ' '.join(x.strip().split()),  # removes extra white spaces
    +    'space_detok': lambda toks: ' '.join(toks),
    +    'moses_tok': partial(MosesTokenizer().tokenize, escape=False, return_str=True,
    +                         aggressive_dash_splits=True,
    +                         protected_patterns=MosesTokenizer.WEB_PROTECTED_PATTERNS),
    +    'moses_detok': partial(MosesDetokenizer().detokenize, return_str=True, unescape=True),
    +    'moses_truecase': partial(MosesTruecaser().truecase, return_str=True),
    +    'lowercase': lambda x: x.lower(),
    +    'drop_unk': lambda x: x.replace('<unk>', ''),
    +    'html_unescape': html.unescape,
    +    'punct_norm': MosesPunctNormalizer().normalize
    +}
    +
    +
    +
    +

    When {src_pre,tgt_pre,tgt_post}_proc are missing, we use sensible defaults (the same as the ones used in https://aclanthology.org/2021.acl-demo.37/).

    +
    +
    +
    +
    src_pre_proc:
    +  - html_unescape
    +  - punct_norm
    +  - moses_tok
    +tgt_post_proc:
    +  - moses_detok
    +  - drop_unk
    +
    +
    +
    +

    You may also use shell command line, including unix pipes, by prefixing your command with "#!". In addition, you may mix shell commands with known (pythonic) transforms. Example:

    +
    +
    +
    +
    prep:
    +  src_pre_proc:
    +    - "#!/path/to/normalizer.perl | /path/to/tokenizer.py --lang deu"
    +    - lowercase
    +  tgt_post_proc:
    +    - drop_unk
    +    - moses_detok
    +
    +
    +
    +
    Disabling pre- and post- processing
    +
      +
    • +

      You may permanently disable preprocessing and post processing using

      +
    • +
    +
    +
    +
    +
    prep:
    +  src_pre_proc:
    +    - no_op
    +  tgt_post_proc:
    +    - no_op
    +
    +
    +
    + +
    +
    +

    NOTE: + {src,tgt}_pre_proc and tgt_post_proc are only used by the REST API as of now. rtg.decode and rtg.prep do not yet use pre- and post- text transformers.

    +
    +
    +
    -

    12. Development Environment:

    +

    13. Development Environment:

    -

    12.1. Run Tests

    +

    13.1. Run Tests

    Test cases are done using the pytest framework. It can be installed using pip install pytest

    @@ -2206,7 +2321,7 @@

    12.1. Run Tests

    -

    12.2. Adding a new model

    +

    13.2. Adding a new model

    1. @@ -2263,7 +2378,7 @@

      12.2. Adding a new model

    # NOTE: patch headers that shared these lines with the script, kept verbatim.
    # diff --git a/rtg/__init__.py b/rtg/__init__.py
    # index ac3e6fb..0d244cf 100644
    # --- a/rtg/__init__.py
    # +++ b/rtg/__init__.py
    # @@ -1,4 +1,4 @@
    # -__version__ = '0.5.1-dev'
    # +__version__ = '0.5.1'
    #  import os
    #  import logging
    # diff --git a/scripts/rtg-translate.py b/scripts/rtg-translate.py
    # new file mode 100644
    # index 0000000..40db31f
    # --- /dev/null
    # +++ b/scripts/rtg-translate.py
    # @@ -0,0 +1,102 @@

#!/usr/bin/env python
#
# Author: Thamme Gowda [tg (at) isi (dot) edu]
# Created: 8/10/21
"""Command line client for an RTG translation REST endpoint.

Reads sentences (one per line, optionally ``<id>\\t<text>``), POSTs them in
batches as ``{'source': [...]}`` to the API URL, and writes the strings found
under the ``'translation'`` key of the JSON response, one per line.
"""

import logging as log
import json
from typing import List, Iterator, Union

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; keep working without it
    def tqdm(iterable, total=None, **_kwargs):
        """Fallback for a missing ``tqdm``: iterate without a progress bar."""
        return iterable


log.basicConfig(level=log.INFO)
DEF_API = "https://localhost:6060/translate"
DEF_BATCHSIZE = 10


class RTGClient:
    """Thin HTTP client that sends batches of sentences to a translate API."""

    def __init__(self, api_url: str):
        """Remember the endpoint URL; no connection is made here."""
        log.info(f"Creating RTG API Client for {api_url}")
        self.api_url = api_url

    def translate(self, sents: List[str]):
        """Translate one non-empty batch of sentences.

        :param sents: non-empty list of strings
        :return: list of translations, same length and order as ``sents``
        :raises AssertionError: if the input is not a non-empty list of str,
            or the response has a different number of translations
        """
        assert isinstance(sents, list)
        assert len(sents) > 0
        assert isinstance(sents[0], str)
        # Imported here so the module (e.g. --help, batching logic) is usable
        # even when the HTTP dependency is not installed.
        import requests

        sents = [s.strip() or '.' for s in sents]  # insert dot for empty

        data = {'source': sents}
        resp = requests.post(self.api_url, json=data)
        if resp.status_code != 200:
            log.warning(f"Oops! something went wrong. Check logs. See if {self.api_url} is valid")
        result = resp.json()
        result = result['translation']
        assert len(result) == len(sents)
        return result

    def _translate_batch(self, buffer: List[str], ids: List[str], tsv_mode) -> List[str]:
        """Translate ``buffer``, re-attach ids in TSV mode, then empty both lists."""
        result = self.translate(buffer)
        if tsv_mode:
            assert len(ids) == len(buffer)
            result = [f'{sent_id}\t{txt}' for sent_id, txt in zip(ids, result)]
            ids.clear()
        buffer.clear()
        return result

    def translate_all(self, sents: Union[List[str], Iterator[str]], batch_size: int,
                      tsv_mode=False):
        """Lazily translate a stream of sentences in batches.

        :param sents: list or iterator of input lines
        :param batch_size: number of sentences sent per request
        :param tsv_mode: when true, each line is ``<id>\\t<text>``; only the
            text is translated and the id is prefixed to the output line
        :return: generator of output lines, in input order
        :raises ValueError: in TSV mode, if a line has no tab separator
        """
        buffer = []
        ids = []
        total = len(sents) if hasattr(sents, '__len__') else None
        log.info(f"Translating: batch_size {batch_size}; total={total or 'unknown'}")
        for sent in tqdm(sents, total=total):
            if tsv_mode:
                # split on the first tab only, so text containing tabs survives
                sent_id, sent = sent.split('\t', 1)
                ids.append(sent_id)
            buffer.append(sent)
            if len(buffer) >= batch_size:
                yield from self._translate_batch(buffer, ids, tsv_mode)

        if buffer:  # leftover partial batch
            yield from self._translate_batch(buffer, ids, tsv_mode)


def main(**args):
    """Translate ``inp`` through the API and write the results to ``out``.

    When called without keyword arguments, options are parsed from the command
    line. Expected keys: ``api``, ``inp``, ``out``, ``batch_size`` and the
    optional ``tsv``.
    """
    args = args or vars(parse_args())
    client = RTGClient(api_url=args['api'])
    sents = args['inp']

    result = client.translate_all(sents=sents, batch_size=args['batch_size'],
                                  tsv_mode=args.get('tsv'))
    out = args['out']
    count = 0  # number of lines actually written
    for sent in result:
        out.write(f'{sent}\n')
        count += 1
    log.info(f"Wrote {count} lines to {out}")


def parse_args():
    """Parse command line options; input and output default to UTF-8 stdin/stdout."""
    import argparse
    import sys
    import io
    stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='ignore')
    stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore')
    p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    p.add_argument('-a', '--api', default=DEF_API, help='API URL')
    # type=int: without it a user-supplied value stays a str and the
    # `len(buffer) >= batch_size` comparison fails
    p.add_argument('-b', '--batch-size', type=int, default=DEF_BATCHSIZE, help='Batch size')
    p.add_argument('-i', '--inp', type=argparse.FileType('r'), default=stdin,
                   help='Input file path')
    p.add_argument('-o', '--out', type=argparse.FileType('w'), default=stdout,
                   help='Output file path')
    p.add_argument('-tsv', '--tsv', action='store_true', help='Input is TSV of \\t')

    return p.parse_args()


if __name__ == '__main__':
    main()