diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..b506150 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,98 @@ +version: 2 + +job_options: + - &default_job + docker: + - image: cimg/python:3.11.13 + +step_options: + - &restore_pip_cache_options + keys: + - v3-pip-{{ arch }}-{{ checksum "pyproject.toml" }}-{{ checksum "requirements.txt" }} + - v3-pip-{{ arch }}-{{ checksum "pyproject.toml" }} + - v3-pip-{{ arch }} + - v3-pip- + - &save_pip_cache_options + key: v3-pip-{{ arch }}-{{ checksum "pyproject.toml" }}-{{ checksum "requirements.txt" }} + paths: + - ~/.local + - ~/.cache + - &restore_venv_cache_options + keys: + - v3-venv-{{ arch }}-{{ checksum "pyproject.toml" }}-{{ checksum "requirements.txt" }} + - v3-venv-{{ arch }}-{{ checksum "pyproject.toml" }} + - v3-venv-{{ arch }} + - v3-venv- + - &save_venv_cache_options + key: v3-venv-{{ arch }}-{{ checksum "pyproject.toml" }}-{{ checksum "requirements.txt" }} + paths: + - /home/circleci/project/venv + +steps: + - &restore_pip_cache + restore_cache: + << : *restore_pip_cache_options + - &save_pip_cache + save_cache: + << : *save_pip_cache_options + - &restore_venv_cache + restore_cache: + << : *restore_venv_cache_options + - &save_venv_cache + save_cache: + << : *save_venv_cache_options + +jobs: + prepare_cache: + << : *default_job + steps: + - checkout + - *restore_pip_cache + - *restore_venv_cache + - run: + name: Install dependencies + command: make ci-dev-install + - *save_pip_cache + - *save_venv_cache + + lint: + << : *default_job + steps: + - checkout + - *restore_pip_cache + - *restore_venv_cache + - run: + name: Run the linter + command: make lint + - store_test_results: + path: build/test + + test: + << : *default_job + steps: + - checkout + - *restore_pip_cache + - *restore_venv_cache + - run: + name: Run the tests + command: make test-only + - store_test_results: + path: build/test + - store_artifacts: + path: build/coverage/coverage.xml + destination: 
coverage + - run: + name: Upload coverage to Codecov + command: bash <(curl -s https://codecov.io/bash) -t ${CODECOV_TOKEN} -f build/coverage/coverage.xml || echo "Codecov did not collect coverage reports" + +workflows: + version: 2 + test: + jobs: + - prepare_cache + - lint: + requires: + - prepare_cache + - test: + requires: + - prepare_cache diff --git a/.python-version b/.python-version index 424e179..c70edfa 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.6.8 +3.11.13 diff --git a/.travis.yml b/.travis.yml index 1ef825f..bb6175b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,9 @@ language: python python: - - "3.6" + - "3.11" # command to install dependencies install: - make dev # command to run tests script: - - make tests + - make test diff --git a/Makefile b/Makefile index b16471c..5e64d1f 100644 --- a/Makefile +++ b/Makefile @@ -1,27 +1,31 @@ -# Some simple testing tasks (sorry, UNIX only). - -PYTHON=venv/bin/python3 +PYTHON=venv/bin/python PIP=venv/bin/pip -NOSE=venv/bin/nosetests +NOSE=venv/bin/pynose FLAKE=venv/bin/flake8 PYPICLOUD_HOST=pypicloud.getkeepsafe.local -PIP_ARGS=--extra-index=http://$(PYPICLOUD_HOST)/simple/ --trusted-host $(PYPICLOUD_HOST) +PIP_ARGS=--extra-index-url http://$(PYPICLOUD_HOST)/simple/ --trusted-host $(PYPICLOUD_HOST) TWINE=./venv/bin/twine +PYNOSE_SHARED_FLAGS=-s --with-coverage --cover-inclusive --cover-erase --cover-package=validator tests +PYNOSE_FLAGS=$(PYNOSE_SHARED_FLAGS) +ifdef CI +PYNOSE_FLAGS += --cover-xml --cover-xml-file=build/coverage/coverage.xml --with-xunit --xunit-file=build/test/results.xml +endif FLAGS= -update: - $(PIP) install -U pip - $(PIP) install $(PIP_ARGS) -U . +build-dir: + mkdir -p build/test build/coverage env: - test -d venv || python3 -m venv venv + test -d venv || python3.11 -m venv venv + $(PIP) install -U pip setuptools wheel + $(PIP) install $(PIP_ARGS) -e . 
-dev: env update - $(PIP) install $(PIP_ARGS) .[tests,devtools] +dev: env + $(PIP) install $(PIP_ARGS) -e '.[dev]' -install: env update +install: env -publish: +publish: dev rm -rf dist $(PYTHON) -m build . $(TWINE) upload --verbose --sign --username developer --repository-url http://$(PYPICLOUD_HOST)/simple/ dist/*.whl @@ -29,16 +33,43 @@ publish: flake: $(FLAKE) validator tests -test: flake - $(NOSE) -s $(FLAGS) +check-msgpack: + @true + +lint: build-dir flake check-msgpack + +test-only: build-dir + $(NOSE) $(PYNOSE_FLAGS) $(FLAGS) -vtest: - $(NOSE) -s -v $(FLAGS) +test: lint test-only -cov cover coverage: - $(NOSE) -s --with-cover --cover-html --cover-html-dir ./coverage $(FLAGS) +vtest vtests: build-dir + $(NOSE) -v $(PYNOSE_FLAGS) $(FLAGS) + +cov cover coverage: build-dir + $(NOSE) $(PYNOSE_FLAGS) --cover-html --cover-html-dir ./coverage $(FLAGS) echo "open file://`pwd`/coverage/index.html" +ci-env: + @if [ -d "venv" ] && $(PIP) --version >/dev/null 2>&1; then \ + echo "Reusing cached CI venv, no need to recreate when it hasn't changed"; \ + else \ + echo "No cached venv found, creating fresh venv..."; \ + if [ -d "venv" ]; then rm -rf venv; fi; \ + python3.11 -m venv venv; \ + $(PIP) install -U pip setuptools wheel; \ + fi + +ci-dev-install: ci-env + $(PIP) install $(PIP_ARGS) -e '.[dev]' + +hooks: + cp git_hooks/pre-push `git rev-parse --git-path hooks/pre-push` + chmod +x `git rev-parse --git-path hooks/pre-push` + +unhooks: + rm -f `git rev-parse --git-path hooks/pre-push` + clean: rm -rf `find . -name __pycache__` rm -f `find . 
-type f -name '*.py[co]' ` @@ -51,7 +82,10 @@ clean: rm -f .coverage rm -rf coverage rm -rf build + rm -rf dist + rm -rf *.egg-info rm -rf venv -.PHONY: all build env linux run pep test vtest testloop cov clean +.PHONY: build-dir env dev install publish flake check-msgpack lint test-only test vtest vtests cov cover coverage ci-env \ + ci-dev-install hooks unhooks clean diff --git a/README.md b/README.md index f75c5d8..2db332a 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ -content-validator [![Build Status](https://travis-ci.org/KeepSafe/content-validator.svg?branch=master)](https://travis-ci.org/KeepSafe/content-validator) +content-validator [![Build Status](https://travis-ci.org/KeepSafe/content-validator.svg?branch=master)](https://travis-ci.org/KeepSafe/content-validator) [![CircleCI](https://circleci.com/gh/KeepSafe/content-validator.svg?style=svg)](https://circleci.com/gh/KeepSafe/content-validator) ================= Content validator looks at text content and preforms different validation tasks. ## Requirements -1. Python 3.6.6+ +1. Python 3.11 ## Installation @@ -16,16 +16,23 @@ Content validator looks at text content and preforms different validation tasks. `make env` `make dev` +Common local commands: + +* `make env` - create `venv` and install the package runtime dependencies. +* `make dev` - install runtime and development/test dependencies. +* `make lint` - run flake8 using `pyproject.toml` configuration. +* `make test` - run lint and the test suite. +* `make coverage` - run tests with coverage and write the HTML report to `coverage/`. +* `make clean` - remove local build, coverage, cache, and virtualenv artifacts. + ## Usage Generally it's easiest to write a separate test for each validation case. 
The simplest example: ``` -f = files('src/**/*.txt') -parser = create_parser(Filetype.txt) -reporter = ConsoleReporter() -check = urls(Filetype.txt) -result = validator.validate(checks=[check], files=f, parser=parser, reporter=reporter) +import validator + +result = validator.parse().files('src/**/*.txt').check().url().validate() self.assertEqual([], result) ``` @@ -41,34 +48,41 @@ In case you are not doing any comparison checks you can use a usual glob like pa When the file is first read it the data you want to validate needs to be extracted from it. The simplest example is a text file. Nothing is done here except reading the file content. The more complex example is when, for eg., you have embedded markdown in an xml tag. To extract the data you should create a chain of parsers. First you want to extract all tags from the xml. Second you want to parse the content of the tags from markdown to html. Here is an example how to do that: -`chain_parsers([Filetype.xml, Filetype.md], query='//strings')` +`validator.parse().files('src/{lang}/*.xml', lang='en').xml(query='.//string').md().check().md().validate()` -The xml parser takes additional parameter `query` used to extract the tags. You can pass it to `create_parser` in the same way: +The xml parser takes additional parameter `query` used to extract the tags: -`create_parser(Filetype.xml, query='//strings')` +`validator.parse().files('src/{lang}/*.xml', lang='en').xml(query='.//string')` Available parsers types: -* `Filetype.txt` - simply reads the file -* `Filetype.md` - converts markdown to -* `Filetype.xml` - extracts text from xml and concatenates -* `Filetype.csv` - puts every value on a separate line +* `files(...).check()` - simply reads the file +* `.md()` - converts markdown to HTML +* `.xml(query='*')` - extracts text from XML and concatenates matching elements +* `.csv()` - puts every value on a separate line ### Reporters Shows the result of the validation. 
There are 2 reporters available: * `HtmlReporter` - creates an error file for every error -* `ConsoleReporter` - print the error to the console +* `ConsoleReporter` - prints the error to the console ### Checks -Checks perform validation on the content. Wether it's url or structure or anything else. If the content in not valid the check will return an error which later can be passed to a reporter. +Checks perform validation on the content, whether it's a URL, a structure, or anything else. If the content is not valid the +check will return an error which later can be passed to a reporter. Available checks: -* `urls(filetype, skip_images=False)` - validates if the url is accessible -* `markdown()` - validates markdown structure by comparing it with the base +* `.url(skip_images=False)` - validates if the url is accessible +* `.md()` - validates markdown structure by comparing it with the base +* `.java()` - validates Java placeholder/reference compatibility + +## CLI + +The package exposes a `content-validator` command. The current CLI is intentionally minimal; use `content-validator --help` +or `content-validator --version` for smoke checks, and use the Python API for validations. 
## Example @@ -78,18 +92,23 @@ A more detailed example looks like this: class TestEmail(TestCase): def test_email(self): - f = files('src/{lang}/*.xml', lang='en') - parser = create_parser(Filetype.xml, query='.//string') - reporter = HtmlReporter() - md = markdown() - result = validator.validate(checks=[md], files=f, parser=parser, reporter=reporter) - self.assertEqual({}, v.validate()) + result = validator \ + .parse() \ + .files('src/{lang}/*.xml', lang='en') \ + .xml(query='.//string') \ + .check() \ + .md() \ + .validate() + self.assertEqual([], result) def test_urls(self): - f = files('src/{lang}/*.xml', lang='en') - parser = chain_parsers([Filetype.xml, Filetype.md], query='.//string') - reporter = ConsoleReporter() - check = urls(Filetype.html, skip_images=True) - result = Validator(checks=[check], files=f, parser=parser, reporter=reporter) - self.assertEqual({}, v.validate()) + result = validator \ + .parse() \ + .files('src/{lang}/*.xml', lang='en') \ + .xml(query='.//string') \ + .md() \ + .check() \ + .url(skip_images=True) \ + .validate() + self.assertEqual([], result) ``` diff --git a/docs/python311-migration-contract.md b/docs/python311-migration-contract.md new file mode 100644 index 0000000..3d3ea52 --- /dev/null +++ b/docs/python311-migration-contract.md @@ -0,0 +1,195 @@ +# content-validator Python 3.11 Migration Contract + +## Scope + +`content-validator` is a no-stack library/tool migration. The repo exposes a Python package named `validator` and a +`content-validator` console entry point, but it has no PasteDeploy app factory, Gunicorn config, service INI files, +health endpoint, worker process, or local service dependencies. Service-only `python311-service-upgrade-stack` tasks are +therefore intentionally not part of this migration unless a later audit proves new service shape. 
+ +Edit root: `/Users/olmos/keepsafe/repos/worktrees/content-validator-python-upgrade` +Branch plan: single reviewable branch, `python311-upgrade`, created from `master`. +First write scope: this contract, packaging/test workflow, and golden compatibility tests. + +## Public Surfaces + +- Python API: `validator.parse()` builder chain and `validator.checks` helpers for markdown, URL, URL occurrence, and + Java argument validation. +- Importable modules used downstream: `validator`, `validator.errors`, `validator.checks.url`. +- CLI surface: `content-validator` console script declared by package metadata. +- Behavior-sensitive fixtures: existing markdown, URL occurrence, parser bug, flat text, Java placeholder, and report + fixtures under `tests/fixtures/`. + +## Downstream Consumers + +Local scan found these active consumers: + +- `email-service`: depends on `content-validator` through `setup.py` / requirements and imports `validator` in + `mailman/cms/validation.py`. +- `translation-real-time-validaton`: pins `content-validator == 0.7.2`, imports `validator`, and directly imports + `validator.checks.url.DEFAULT_USER_AGENT`, `TextUrlExtractor`, `UrlStatusChecker`, and `UrlDiff`. +- `ansible`: installs `content-validator` for the `zendesk-knowledgebase-editor` role. + +Downstream unblocked criteria: package installs on Python 3.11, imports the documented modules, preserves fixture-backed +validation behavior, and publishes dependency pins that downstream requirement compilers can resolve. + +## Baseline Audit + +- Packaging: legacy `setup.py` / `setup.cfg`, version `0.7.2`, `.python-version` is `3.6.8`. +- Runtime dependencies: `aiohttp >=3, <3.4`, `Markdown`, `parse <= 1.8.2`, `beautifulsoup4 >=4, <5`, `lxml >=3`, + and direct tagged dependency `sdiff @ git+https://github.com/KeepSafe/html-structure-diff.git@0.4.1`. +- Dev/test dependencies: `nose`, `flake8==3.6.0`, `coverage`, `twine`, `build`. 
+- Test runner: Makefile uses `nosetests`; Travis calls non-existent `make tests`. +- Service audit: no service shape found. +- Python 3.11 baseline result before migration: + - `make test` failed because `flake8==3.6.0` imports removed `pkg_resources` with current setuptools. + - `venv/bin/nosetests -s` failed because `nose` references removed `collections.Callable`. + - `import aiohttp` failed because `aiohttp<3.4` references removed `asyncio.coroutines._DEBUG`. + +## Task Mapping + +| Skill task | Applicability | Plan | +| --- | --- | --- | +| Task 1: pyproject, Python 3.11, dependency audit, pyupgrade | Applicable | Replace legacy packaging with `pyproject.toml`, set Python 3.11 policy, bump version to `1.0.0`, hard-pin runtime deps, upgrade Python 3.11-incompatible deps, and run pyupgrade. | +| Task 2a: formatting and Flake8 alignment | Applicable | Keep 120-char style in `pyproject.toml`, use `flake8-pyproject`, and fix lint only as needed. | +| Task 2b: hooks, CI, Makefile, README | Partial | Normalize Makefile, README, existing Travis workflow, and add CircleCI for this package. Service runner targets are not required for this no-stack repo. | +| Task 2c: mypy stabilization | Not applicable | No existing typing contract or service baseline requires mypy for this small library in this migration. | +| Task 3: msgpack/redis/asynctest/nose | Partial | Replace `nose` with `pynose`. No `msgpack` dependency exists in `pyproject.toml`, and no redis, aioredis, or asynctest usage exists. | +| Task 4: asyncio/aiohttp modernization | Applicable | Upgrade `aiohttp`, keep URL checker request behavior, and remove Python 3.11-incompatible loop usage in tests. | +| Task 4c: async test harness modernization | Applicable | Keep the lightweight local helper but remove removed `loop=` APIs and validate async URL tests. | +| Task 5: Gunicorn/Docker local infra | Not applicable | No service runtime or local backing services. 
| +| Task 6: requirements build pipeline | Partial | Keep simple package requirements files aligned with `pyproject.toml`; no ansible service lockfile pipeline applies. | + +## Skill Guardrail Audit + +- Preflight: `python3.11`, `rg`, and `gh` were present. Docker/service daemon checks were not run because the repo audit + classified this as no-stack library/tool shape. +- Branching: used the requested single `python311-upgrade` branch for this no-stack repo instead of six stacked service + PRs. The branch split is documented here. +- Service contract tooling: the bundled service contract verifier and CI gate require service-template assertions. They + are not applicable to this package because there is no Paste/Gunicorn/INI/healthcheck service surface to validate. +- `libks==1.0.0`: not applicable; `content-validator` does not depend on `libks`. +- `LIBKS_VERSION` Makefile extraction: not applicable for the same reason. +- CircleCI sample addition: applicable by team request. The branch adapts the skill sample + `resources/python-services/samples/circleci_config.yml` into `.circleci/config.yml` with `prepare_cache`, `lint`, + and `test` jobs, `cimg/python:3.11.13`, sample-style `v3-pip-` / `v3-venv-` fallback cache keys, xUnit/coverage XML + artifact storage, and the sample non-fatal Codecov upload step. The install job uses this repo's `make ci-dev-install` + target without the sample's libks-specific SSH install. +- `PYNOSE_SHARED_FLAGS`: applicable. Makefile test flow uses the sample-style coverage-inclusive pynose flags and adds + CI XML/xunit artifact flags under `ifdef CI`. +- Runtime `print()` guardrail: `ConsoleReporter` intentionally prints user-facing report output for a library reporter, + not long-running service runtime output. 
+ +## Dependency Audit Notes + +Upgraded because Python 3.11 compatibility or modern tooling required it: + +- `aiohttp >=3,<3.4` / `aiohttp==3.1.3` -> `aiohttp==3.13.5`: old versions fail to import on Python 3.11 due to removed + `asyncio.coroutines._DEBUG`. +- `beautifulsoup4 >=4,<5` / `beautifulsoup4==4.4.1` -> `beautifulsoup4==4.14.3`: selected as the current Python + 3.11-compatible package set and covered by existing HTML/URL fixture tests. +- `lxml >=3` / `lxml==3.5` -> `lxml==6.1.0`: old pin lacks the target Python 3.11 wheel/runtime baseline; parser and + reporter fixtures cover the exercised behavior. +- `Markdown` / unpinned -> `Markdown==3.10.2`: pinned to the resolved Python 3.11-compatible runtime set and covered by + markdown diff fixtures. +- `parse <= 1.8.2` / `parse==1.8.2` -> `parse==1.22.0`: latest available version passed parser, URL, and fixture + coverage locally. +- `sdiff @ git+https://github.com/KeepSafe/html-structure-diff.git@0.4.1` -> + `sdiff @ git+https://github.com/KeepSafe/html-structure-diff.git@1.0.0`: latest tagged version installed, imported, + and passed existing HTML/markdown structure-diff fixture coverage locally. +- `flake8==3.6.0` -> `flake8==7.3.0` plus `flake8-pyproject==1.2.4`: old flake8 fails with modern setuptools because + `pkg_resources` is no longer available by default. +- `nose` -> `pynose==1.5.5`: old nose fails on Python 3.11 due to removed `collections.Callable`. + +Latest-version audit on 2026-05-13: + +- Current/latest: `aiohttp==3.13.5`, `beautifulsoup4==4.14.3`, `lxml==6.1.0`, `Markdown==3.10.2`, + `parse==1.22.0`, `build==1.5.0`, `coverage==7.14.0`, `flake8==7.3.0`, `flake8-pyproject==1.2.4`, + `pynose==1.5.5`, `pyupgrade==3.21.2`, `twine==6.2.0`, `setuptools>=82.0.1`, and `wheel>=0.47.0`. +- Current/latest git tag: `sdiff @ git+https://github.com/KeepSafe/html-structure-diff.git@1.0.0`. +- Msgpack: not applicable. 
`msgpack` is not a `content-validator` dependency and there are no direct source/test + msgpack call sites to migrate. + +## Proof Commands + +Default proof must be local/free and must not call production, paid providers, or public external service APIs. + +- `python3.11 -m venv venv` +- `venv/bin/pip install -e '.[dev]'` +- `make lint` +- `make test` +- `make ci-dev-install` +- `CI=1 make test` +- `circleci config validate .circleci/config.yml` +- `venv/bin/python -m compileall validator tests` +- Import smoke for `validator`, `validator.checks.url`, `aiohttp`, `bs4`, `lxml`, `markdown`, `parse`, and `sdiff`. +- Golden compatibility tests over existing fixtures, with URL network calls mocked. +- CLI smoke: `venv/bin/content-validator --help`. +- `venv/bin/pip check` + +## Known Gaps + +- Dependency version discovery and installation require package index/GitHub access, but behavior proof must not call + production or paid provider APIs. + +## Migration Results + +Date: 2026-05-11, refreshed against `python311-service-upgrade-stack` on 2026-05-13. + +Branch: `python311-upgrade` + +Completed applicable work: + +- Replaced `setup.py` / `setup.cfg` with `pyproject.toml`. +- Set Python policy to `3.11.13` in `.python-version` and `>=3.11,<3.12` in package metadata. +- Bumped package version from `0.7.2` to `1.0.0`. +- Hard-pinned runtime dependencies in `pyproject.toml` and aligned `requirements.txt`. +- Upgraded Python 3.11-incompatible dependencies: + - `aiohttp >=3,<3.4` / `aiohttp==3.1.3` to `aiohttp==3.13.5`; old import failed on removed + `asyncio.coroutines._DEBUG`. + - `beautifulsoup4` to `4.14.3`, `lxml` to `6.1.0`, `Markdown` to `3.10.2`, `parse` to `1.22.0`, and `sdiff` to + tag `1.0.0` as the latest Python 3.11 package set. + - `flake8==3.6.0` to `flake8==7.3.0` plus `flake8-pyproject==1.2.4`; old flake8 failed on removed + `pkg_resources`. + - `nose` to `pynose==1.5.5`; old nose failed on removed `collections.Callable`. 
+- Ran pyupgrade ladder through `--py311-plus`. +- Added fixture-backed golden compatibility tests for markdown diff shape, Java placeholders, and URL extraction. +- Added a minimal `content-validator` CLI help/version smoke surface because package metadata already declared the + console script. +- Updated README, Travis command, sample-shaped CircleCI config, Makefile, and git hook target for the Python 3.11 + package workflow. +- Refreshed the Makefile test flow to use sample-style `PYNOSE_SHARED_FLAGS`, including coverage-inclusive defaults and + CI XML/xunit artifact flags under `ifdef CI`. +- Added sample-style CI cache/install targets: `ci-env` reuses a valid cached venv or recreates it, and + `ci-dev-install` installs `.[dev]` through the shared private-index-aware `PIP_ARGS`. +- Added explicit dependency audit notes and latest-version proof for runtime, build, and test pins. + +Proof results: + +- `python3.11 --version`: Python 3.11.13. +- `make clean`: pass. +- `make dev`: pass with package-index/GitHub dependency resolution. +- `make ci-dev-install`: pass with package-index/GitHub dependency resolution after expected sandbox DNS escalation. +- `make test`: pass, 65 tests, 1 skipped, coverage total 84%. +- `CI=1 make test`: pass, 65 tests, 1 skipped, writes `build/coverage/coverage.xml` and `build/test/results.xml`. +- `venv/bin/flake8 --version`: reports `7.3.0` with `Flake8-pyproject: 1.2.4`. +- `venv/bin/pynose --version`: reports `1.5.5`. +- `venv/bin/python -m compileall validator tests`: pass. +- Import smoke for `validator`, `validator.checks.url`, `aiohttp==3.13.5`, `beautifulsoup4==4.14.3`, + `lxml==6.1.0`, `Markdown==3.10.2`, `parse==1.22.0`, and `sdiff==1.0.0`: pass. +- `venv/bin/content-validator --help` and `venv/bin/content-validator --version`: pass. +- `venv/bin/pip check`: pass. +- `venv/bin/python -m build .`: pass; built local sdist and wheel under ignored `dist/`. 
+- `circleci config validate .circleci/config.yml`: pass; CircleCI CLI reported the config is valid. +- `make hooks`: pass after escalation to write shared git metadata; hook was removed afterward with `make unhooks`. + +Service-only tasks intentionally skipped: + +- No Gunicorn, PasteDeploy, service INI, health endpoint, worker, Docker local infra, or ansible service requirements + pipeline was added. + +Known gaps after migration: + +- Dependency installation/build proof required network access to package indexes and GitHub for the tagged `sdiff` + dependency. +- Downstream repos still need their own requirements recompilation against `content-validator==1.0.0`. diff --git a/git_hooks/pre-push b/git_hooks/pre-push new file mode 100644 index 0000000..04ed983 --- /dev/null +++ b/git_hooks/pre-push @@ -0,0 +1,5 @@ +#!/bin/sh +set -e + +make lint +make test-only diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..2ac33bb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,52 @@ +[build-system] +requires = ["setuptools>=82.0.1", "wheel>=0.47.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "content-validator" +version = "1.0.0" +description = "Content validator looks at text content and performs different validation tasks" +readme = "README.md" +requires-python = ">=3.11,<3.12" +license = "Apache-2.0" +authors = [ + { name = "Keepsafe", email = "support@getkeepsafe.com" }, +] +classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", +] +dependencies = [ + "aiohttp==3.13.5", + "beautifulsoup4==4.14.3", + "lxml==6.1.0", + "Markdown==3.10.2", + "parse==1.22.0", + "sdiff @ git+https://github.com/KeepSafe/html-structure-diff.git@1.0.0", +] + +[project.optional-dependencies] +dev = [ + "build==1.5.0", + "coverage==7.14.0", + "flake8==7.3.0", + "flake8-pyproject==1.2.4", + "pynose==1.5.5", + "pyupgrade==3.21.2", + 
"twine==6.2.0", +] + +[project.scripts] +content-validator = "validator:main" + +[project.urls] +Homepage = "https://github.com/KeepSafe/content-validator/" + +[tool.setuptools.packages.find] +exclude = ["tests"] + +[tool.flake8] +max-line-length = 120 +ignore = ["F403"] diff --git a/requirements-dev.txt b/requirements-dev.txt index d01e4c4..9f929d6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,8 @@ -r requirements.txt -flake8==3.6.0 -nose -coverage +build==1.5.0 +coverage==7.14.0 +flake8==7.3.0 +flake8-pyproject==1.2.4 +pynose==1.5.5 +pyupgrade==3.21.2 +twine==6.2.0 diff --git a/requirements.txt b/requirements.txt index 4036e54..be79f93 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -beautifulsoup4==4.4.1 -Markdown -html2text==2014.12.29 -lxml==3.5 -parse==1.8.2 -aiohttp==3.1.3 +aiohttp==3.13.5 +beautifulsoup4==4.14.3 +lxml==6.1.0 +Markdown==3.10.2 +parse==1.22.0 +sdiff @ git+https://github.com/KeepSafe/html-structure-diff.git@1.0.0 diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 0945a29..0000000 --- a/setup.cfg +++ /dev/null @@ -1,9 +0,0 @@ -[metadata] -description-file = README.md - -[flake8] -max-line-length = 120 -ignore = F403 - -[pep8] -max-line-length = 120 diff --git a/setup.py b/setup.py deleted file mode 100644 index 6f1ddb5..0000000 --- a/setup.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -from setuptools import setup, find_packages - - -version = '0.7.2' - - -def read(f): - return open(os.path.join(os.path.dirname(__file__), f)).read().strip() - - -install_requires = [ - 'sdiff @ git+https://github.com/KeepSafe/html-structure-diff.git@0.4.1#egg=sdiff', - 'aiohttp >=3, <3.4', - 'Markdown', - 'parse <= 1.8.2', - 'beautifulsoup4 >=4, <5', - 'lxml >=3', -] - -tests_require = [ - 'nose', - 'flake8==3.6.0', - 'coverage', -] - -devtools_require = [ - 'twine', - 'build', -] - -setup( - name='content-validator', - version=version, - description=('Content validator looks at text content and 
preforms different validation tasks'), - classifiers=[ - 'License :: OSI Approved :: BSD License', 'Intended Audience :: Developers', 'Programming Language :: Python' - ], - author='Keepsafe', - author_email='support@getkeepsafe.com', - url='https://github.com/KeepSafe/content-validator/', - license='Apache', - packages=find_packages(exclude=['tests']), - package_data={}, - namespace_packages=[], - install_requires=install_requires, - tests_require=tests_require, - extras_require={ - 'tests': tests_require, - 'devtools': devtools_require, - }, - entry_points={'console_scripts': ['content-validator = validator:main']}, - include_package_data=False) diff --git a/tests/__init__.py b/tests/__init__.py index 7633d8b..fa94b7c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -16,7 +16,7 @@ def coro(self, coro): return self.loop.run_until_complete(coro) def make_fut(self, result): - fut = asyncio.Future(loop=self.loop) + fut = self.loop.create_future() fut.set_result(result) return fut diff --git a/tests/test_golden_compatibility.py b/tests/test_golden_compatibility.py new file mode 100644 index 0000000..ab02e43 --- /dev/null +++ b/tests/test_golden_compatibility.py @@ -0,0 +1,41 @@ +from unittest import TestCase + +import validator +from validator.checks.url import TextUrlExtractor + + +class TestGoldenCompatibility(TestCase): + def test_markdown_fixture_diff_shape(self): + errors = validator.parse().files('tests/fixtures/lang/{lang}/test2.md', lang='en').check().md().validate() + + self.assertEqual(1, len(errors)) + error = errors[0] + self.assertEqual('tests/fixtures/lang/en/test2.md', str(error.base.original)) + self.assertEqual('tests/fixtures/lang/de/test2.md', str(error.other.original)) + self.assertEqual([ + 'There is a missing element `text`.', + 'There is a missing element `header`.', + 'There is a missing element `text`.', + 'There is a missing element `paragraph`.', + 'There is a missing element `text`.', + 'There is a missing element `new-line`.', + 'There is 
an additional element `header`.', + 'There is an additional element `text`.', + 'There is an additional element `paragraph`.', + 'There is an additional element `text`.', + ], list(error.error_msgs)) + + def test_java_placeholder_fixture_shape(self): + same_errors = validator.parse().text('aaa %1.2s aaa', 'bbb %1.2s bbb').check().java().validate() + diff_errors = validator.parse().text('aaa %1.2s aaa', 'bbb bbb').check().java().validate() + + self.assertEqual([], same_errors) + self.assertEqual(1, len(diff_errors)) + self.assertEqual('java args do not match', diff_errors[0].error_msgs) + + def test_url_extractor_fixture_shape(self): + urls = TextUrlExtractor().extract_urls( + 'one http://example.com/a?b=1 two http://{{placeholder}} three http://www.google¡.com' + ) + + self.assertEqual(['http://example.com/a?b=1', 'http://www.google.com'], sorted(urls)) diff --git a/tests/utils.py b/tests/utils.py index 3948220..e3aa96c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,3 @@ - def read(path): with open(path) as fp: return fp.read() diff --git a/validator/__init__.py b/validator/__init__.py index c7301e4..436bc09 100644 --- a/validator/__init__.py +++ b/validator/__init__.py @@ -1,9 +1,12 @@ +import argparse +from importlib.metadata import PackageNotFoundError, version + import sdiff from . 
import parsers, checks, reports, fs -class Validator(object): +class Validator: def __init__(self, contents, parser, reader, check, reporter=None): self.contents = contents self.parser = parser @@ -24,7 +27,7 @@ async def async_validate(self): return errors -class ReportBuilder(object): +class ReportBuilder: def __init__(self, contents, parser, reader, check): self.contents = contents self.parser = parser @@ -49,7 +52,7 @@ def validate(self): return Validator(self.contents, self.parser, self.reader, self.check, reporter).validate() -class CheckBuilder(object): +class CheckBuilder: def __init__(self, contents, content_type, parser, reader): self.contents = contents self.content_type = content_type @@ -89,7 +92,7 @@ async def async_validate(self): return res -class ParserBuilder(object): +class ParserBuilder: def __init__(self, contents, reader=None): self.contents = contents self.content_type = 'txt' @@ -120,7 +123,7 @@ def check(self): return CheckBuilder(self.contents, self.content_type, parser, self.reader) -class ContentBuilder(object): +class ContentBuilder: def files(self, pattern, **kwargs): contents = fs.files(pattern, **kwargs) return ParserBuilder(contents, parsers.FileReader()) @@ -140,3 +143,17 @@ def text(self, base, other): def parse(): return ContentBuilder() + + +def main(argv=None): + try: + package_version = version('content-validator') + except PackageNotFoundError: + package_version = 'unknown' + parser = argparse.ArgumentParser( + prog='content-validator', + description='Validate translated content with the validator Python API.', + ) + parser.add_argument('--version', action='version', version='%(prog)s {}'.format(package_version)) + parser.parse_args(argv) + return 0 diff --git a/validator/checks/__init__.py b/validator/checks/__init__.py index 9dc692f..61861f2 100644 --- a/validator/checks/__init__.py +++ b/validator/checks/__init__.py @@ -1,5 +1,3 @@ -from typing import Type - from sdiff import MdParser from .md import MarkdownComparator @@ 
-21,7 +19,7 @@ def url_occurences(filetype): return UrlOccurenciesValidator() -def markdown(filetype, md_parser_cls: Type[MdParser] = MdParser): +def markdown(filetype, md_parser_cls: type[MdParser] = MdParser): if filetype not in ['txt', 'html']: raise UndefinedCheckTypeError('got filetype %s' % filetype) return MarkdownComparator(md_parser_cls) @@ -33,7 +31,7 @@ def java_args(filetype): return JavaComparator() -class ChainCheck(object): +class ChainCheck: def __init__(self, checks): self.checks = checks diff --git a/validator/checks/java.py b/validator/checks/java.py index 319ec2d..70cfd0d 100644 --- a/validator/checks/java.py +++ b/validator/checks/java.py @@ -6,7 +6,7 @@ REF_PATTERN = r'@string/\w+' -class JavaComparator(object): +class JavaComparator: def _get_args(self, content): return re.findall(ARG_PATTERN, content) diff --git a/validator/checks/md.py b/validator/checks/md.py index d8542a8..70e6988 100644 --- a/validator/checks/md.py +++ b/validator/checks/md.py @@ -1,5 +1,4 @@ import re -from typing import Type from sdiff import diff, renderer, MdParser from markdown import markdown @@ -14,8 +13,8 @@ def save_file(content, filename): fp.write(content) -class MarkdownComparator(object): - def __init__(self, md_parser_cls: Type[MdParser] = MdParser): +class MarkdownComparator: + def __init__(self, md_parser_cls: type[MdParser] = MdParser): self._md_parser_cls = md_parser_cls def check(self, data, parser, reader): diff --git a/validator/checks/url.py b/validator/checks/url.py index dabd816..263a532 100644 --- a/validator/checks/url.py +++ b/validator/checks/url.py @@ -5,7 +5,6 @@ import string from bs4 import BeautifulSoup from urllib.parse import urlparse, urljoin -from typing import List, Optional from ..errors import UrlDiff, UrlOccurencyDiff @@ -23,7 +22,7 @@ class MissingUrlExtractorError(Exception): # the job of extractors is to find all non-parametrized urls in the given text for later checks via UrlValidator # which examines is particular url leads 
to working webpage (200 status) # since we are interested in all urls (including parametrized) we need to sligthly change their API and behaviour -class TextUrlExtractor(object): +class TextUrlExtractor: def __init__(self, **kwargs): pass @@ -60,12 +59,12 @@ def _validate_email(self, email): return False def _extract_from_anchors(self, soup): - return set([a.get('href') or a.text for a in soup.find_all('a')]) + return {a.get('href') or a.text for a in soup.find_all('a')} def _extract_from_img(self, soup): if self.skip_images: return set() - return set([img.get('src') for img in soup.find_all('img')]) + return {img.get('src') for img in soup.find_all('img')} def _fix_url(self, url): result = '' @@ -82,7 +81,7 @@ def _fix_url(self, url): if re.match(self.url_pattern, full_url): result = full_url else: - logging.error('{} not tested'.format(url_parsed.geturl())) + logging.error(f'{url_parsed.geturl()} not tested') return result def extract_urls(self, content, keep_placeholders=False): @@ -96,20 +95,20 @@ def extract_urls(self, content, keep_placeholders=False): return result -class UrlStatusChecker(object): +class UrlStatusChecker: retry_max_count = 3 - def __init__(self, headers=None, exclude_urls_regexs: Optional[List[str]] = None): + def __init__(self, headers=None, exclude_urls_regexs: list[str] | None = None): self._exclude_urls_regex = exclude_urls_regexs or [] if self._exclude_urls_regex: - logging.warning('Excluded urls regexps: {}'.format(self._exclude_urls_regex)) + logging.warning(f'Excluded urls regexps: {self._exclude_urls_regex}') self._headers = headers or {} if 'User-Agent' not in self._headers: self._headers['User-Agent'] = DEFAULT_USER_AGENT async def _make_request(self, url): try: - logging.info('checking {}'.format(url)) + logging.info(f'checking {url}') async with aiohttp.request('get', url, headers=self._headers, allow_redirects=True) as res: return res.status except Exception: @@ -143,7 +142,7 @@ async def _check_urls_coro(self, urls, future): 
if not is_exluded: urls_without_excluded.append(url) else: - logging.warning('url {} excluded from status check'.format(url.url)) + logging.warning(f'url {url.url} excluded from status check') tasks = [self._request_status_code(url.url) for url in urls_without_excluded] results = await asyncio.gather(*tasks) for index, url in enumerate(urls_without_excluded): @@ -167,10 +166,10 @@ async def async_check(self, urls): return future.result() -class UrlValidator(object): +class UrlValidator: _extractors = {'txt': TextUrlExtractor, 'html': HtmlUrlExtractor} - def __init__(self, filetype, headers=None, exclude_status_check_regexs: Optional[List[str]] = None, **kwargs): + def __init__(self, filetype, headers=None, exclude_status_check_regexs: list[str] | None = None, **kwargs): self.client_headers = headers or {} self._excluded_status_check_regexs = exclude_status_check_regexs or [] extractor_class = self._extractors.get(filetype) @@ -179,7 +178,7 @@ def __init__(self, filetype, headers=None, exclude_status_check_regexs: Optional self.extractor = extractor_class(**kwargs) def _get_urls(self, data, parser, reader): - flat_data = set(p for sublist in data for p in sublist) + flat_data = {p for sublist in data for p in sublist} # TODO yield instead urls = {} for element in flat_data: diff --git a/validator/errors.py b/validator/errors.py index 082885a..575bc08 100644 --- a/validator/errors.py +++ b/validator/errors.py @@ -1,7 +1,7 @@ from collections import namedtuple -class UrlDiff(object): +class UrlDiff: def __init__(self, url, files=None, status_code=200, has_disallowed_chars=False): self.url = url @@ -37,7 +37,7 @@ def is_valid(self): ContentData.__new__.__defaults__ = ('', ) * 2 -class MdDiff(object): +class MdDiff: def __init__(self, base, other, error_msgs): self.base = base diff --git a/validator/fs.py b/validator/fs.py index b87808d..6e12b16 100644 --- a/validator/fs.py +++ b/validator/fs.py @@ -91,10 +91,10 @@ def files(pattern, **kwargs): 
[[Path(path/to1/file1.txt), Path(path/to1/file2.txt)], [Path(path/to2/file1.txt), Path(path/to2/file2.txt)]] """ # extract named parameters from the pattern - params = set([p for p in map(lambda e: e[1], Formatter().parse(pattern)) if p]) + params = {p for p in map(lambda e: e[1], Formatter().parse(pattern)) if p} if params: if len(params - kwargs.keys()) > 0: - raise ValueError('missing parameters {} for pattern {}'.format(params - kwargs.keys(), pattern)) + raise ValueError(f'missing parameters {params - kwargs.keys()} for pattern {pattern}') return _params_pattern(pattern, params, **kwargs) else: return _no_params_pattern(pattern) diff --git a/validator/parsers.py b/validator/parsers.py index 379902b..3c87b89 100644 --- a/validator/parsers.py +++ b/validator/parsers.py @@ -9,22 +9,22 @@ def __init__(self, msg): super().__init__(msg) -class FileReader(object): +class FileReader: def read(self, path): return read_content(path) -class TxtReader(object): +class TxtReader: def read(self, content): return content -class MarkdownParser(object): +class MarkdownParser: def parse(self, content): return markdown.markdown(content) -class XmlParser(object): +class XmlParser: def __init__(self, query='*'): self.query = query @@ -38,12 +38,12 @@ def parse(self, content): return '\n\n'.join(texts) -class CsvParser(object): +class CsvParser: def parse(self, content): return '\n'.join(content.split(',')) -class ChainParser(object): +class ChainParser: def __init__(self, parsers): self.parsers = parsers diff --git a/validator/reports.py b/validator/reports.py index c052e62..55b43b2 100644 --- a/validator/reports.py +++ b/validator/reports.py @@ -6,7 +6,7 @@ from .errors import UrlDiff, MdDiff, UrlOccurencyDiff -class HtmlReporter(object): +class HtmlReporter: report_template = """ @@ -82,7 +82,7 @@ def __init__(self, output_directory='errors'): self.output_directory = output_directory def _add_content(self, soup, tag_id, content): - tags = soup.select('#{}'.format(tag_id)) + tags 
= soup.select(f'#{tag_id}') if tags and content: tags[0].append(content) @@ -99,7 +99,7 @@ def report(self, errors): # TODO use mustache for templates report_soup = BeautifulSoup(self.report_template, 'lxml') if isinstance(error, UrlDiff): - messages = ['{} returned with code {}'.format(error.url, error.status_code)] + messages = [f'{error.url} returned with code {error.status_code}'] self._add_content(report_soup, 'urls', '\n'.join(messages)) if isinstance(error, MdDiff): error_msgs = '
'.join(map(lambda i: str(i), error.error_msgs)) @@ -113,20 +113,20 @@ def report(self, errors): save_report(self.output_directory, error.other.original, report_soup.prettify()) -class ConsoleReporter(object): +class ConsoleReporter: def report(self, errors): for error in errors: if isinstance(error, UrlDiff): - print('{} returned with code {}'.format(error.url, error.status_code)) + print(f'{error.url} returned with code {error.status_code}') for path in error.files: - print('\t{}'.format(str(path))) + print(f'\t{str(path)}') print() if isinstance(error, MdDiff): - print('Files are different:\n\t{}\n\t{}\n\n'.format(str(error.base), str(error.other))) + print(f'Files are different:\n\t{str(error.base)}\n\t{str(error.other)}\n\n') -class StoreReporter(object): +class StoreReporter: def __init__(self): self.log = [] @@ -143,7 +143,7 @@ def report(self, errors): self.log.append('Count of URLS in %s and %s are different' % (error.base_path, error.translation_path)) -class ChainReporter(object): +class ChainReporter: def __init__(self, reporters): self.reporters = reporters