From 5ffc0d13a2d38050ba44c2efd848910d87ceb57e Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 20 Nov 2023 11:58:01 -0800 Subject: [PATCH] Migrate linter from `pylint` to `ruff` (#1665) --- .github/workflows/{pylint.yml => ruff.yml} | 10 +- .pylintrc | 434 ------------------ benchmarks/benchmark_throughput.py | 5 +- format.sh | 16 +- pyproject.toml | 24 + requirements-dev.txt | 2 +- setup.py | 14 +- tests/async_engine/api_server_async_engine.py | 1 - tests/async_engine/test_api_server.py | 7 +- tests/conftest.py | 1 - tests/engine/test_detokenize.py | 7 +- tests/kernels/test_attention.py | 2 +- tests/samplers/test_sampler.py | 9 +- tests/worker/test_worker.py | 1 - vllm/core/scheduler.py | 2 +- vllm/engine/llm_engine.py | 4 +- vllm/engine/ray_utils.py | 3 +- vllm/entrypoints/llm.py | 20 +- vllm/entrypoints/openai/api_server.py | 8 +- vllm/model_executor/layers/activation.py | 21 +- vllm/model_executor/layers/attention.py | 1 - .../model_executor/layers/quantization/awq.py | 2 +- vllm/model_executor/model_loader.py | 2 +- vllm/model_executor/models/aquila.py | 5 +- vllm/model_executor/models/baichuan.py | 5 +- vllm/model_executor/models/bloom.py | 5 +- vllm/model_executor/models/chatglm.py | 5 +- vllm/model_executor/models/falcon.py | 5 +- vllm/model_executor/models/gpt2.py | 5 +- vllm/model_executor/models/gpt_bigcode.py | 5 +- vllm/model_executor/models/gpt_j.py | 10 +- vllm/model_executor/models/gpt_neox.py | 5 +- vllm/model_executor/models/internlm.py | 5 +- vllm/model_executor/models/llama.py | 5 +- vllm/model_executor/models/mistral.py | 5 +- vllm/model_executor/models/mpt.py | 13 +- vllm/model_executor/models/opt.py | 5 +- vllm/model_executor/models/phi_1_5.py | 5 +- vllm/model_executor/models/qwen.py | 5 +- vllm/model_executor/models/yi.py | 5 +- vllm/model_executor/weight_utils.py | 10 +- vllm/transformers_utils/config.py | 2 +- vllm/transformers_utils/configs/mpt.py | 16 +- vllm/utils.py | 2 +- vllm/worker/worker.py | 5 +- 45 files changed, 122 insertions(+), 607 deletions(-) rename .github/workflows/{pylint.yml => ruff.yml} (81%) delete mode 100644 .pylintrc diff --git a/.github/workflows/pylint.yml b/.github/workflows/ruff.yml similarity index 81% rename from .github/workflows/pylint.yml rename to .github/workflows/ruff.yml index 1c810adbe3ef4..bd38d11872dc4 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/ruff.yml @@ -1,4 +1,4 @@ -name: pylint +name: ruff on: # Trigger the workflow on push or pull request, @@ -11,7 +11,7 @@ on: - main jobs: - pylint: + ruff: runs-on: ubuntu-latest strategy: matrix: @@ -25,7 +25,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pylint==2.8.2 - - name: Analysing the code with pylint + pip install ruff==0.1.5 + - name: Analysing the code with ruff run: | - pylint vllm tests + ruff vllm tests diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index f85ab742bec30..0000000000000 --- a/.pylintrc +++ /dev/null @@ -1,434 +0,0 @@ -# This Pylint rcfile contains a best-effort configuration to uphold the -# best-practices and style described in the Google Python style guide: -# https://google.github.io/styleguide/pyguide.html -# -# Its canonical open-source location is: -# https://google.github.io/styleguide/pylintrc - -[MASTER] - -# Files or directories to be skipped. They should be base names, not paths. -ignore=docs - -# Files or directories matching the regex patterns are skipped. The regex -# matches against base names, not paths. -ignore-patterns= - -# Pickle collected data for later comparisons. -persistent=no - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Use multiple processes to speed up Pylint. -jobs=4 - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -#enable= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" -disable=abstract-method, - apply-builtin, - arguments-differ, - attribute-defined-outside-init, - backtick, - bad-option-value, - basestring-builtin, - buffer-builtin, - c-extension-no-member, - consider-using-enumerate, - cmp-builtin, - cmp-method, - coerce-builtin, - coerce-method, - delslice-method, - div-method, - duplicate-code, - eq-without-hash, - execfile-builtin, - file-builtin, - filter-builtin-not-iterating, - fixme, - getslice-method, - global-statement, - hex-method, - idiv-method, - implicit-str-concat-in-sequence, - import-error, - import-self, - import-star-module-level, - inconsistent-return-statements, - input-builtin, - intern-builtin, - invalid-str-codec, - locally-disabled, - logging-fstring-interpolation, # added by vLLM - logging-not-lazy, # added by vLLM - long-builtin, - long-suffix, - map-builtin-not-iterating, - misplaced-comparison-constant, - missing-class-docstring, # TODO (vLLM): enable - missing-function-docstring, - missing-module-docstring, # TODO (vLLM): enable - metaclass-assignment, - next-method-called, - next-method-defined, - no-absolute-import, - no-else-break, - no-else-continue, - no-else-raise, - no-else-return, - no-init, # added - no-member, - no-name-in-module, - no-self-use, - nonzero-method, - oct-method, - old-division, - old-ne-operator, - old-octal-literal, - old-raise-syntax, - parameter-unpacking, - print-statement, - raising-string, - range-builtin-not-iterating, - raw_input-builtin, - rdiv-method, - reduce-builtin, - relative-import, - reload-builtin, - round-builtin, - setslice-method, - signature-differs, - standarderror-builtin, - suppressed-message, - sys-max-int, - too-few-public-methods, - too-many-ancestors, - too-many-arguments, - too-many-boolean-expressions, - too-many-branches, - too-many-instance-attributes, - too-many-locals, - too-many-nested-blocks, - too-many-public-methods, - too-many-return-statements, - too-many-statements, - trailing-newlines, - unichr-builtin, - unicode-builtin, - unnecessary-pass, - unpacking-in-except, - unspecified-encoding, - useless-else-on-loop, - useless-object-inheritance, - useless-suppression, - using-cmp-argument, - wrong-import-order, - xrange-builtin, - zip-builtin-not-iterating, - - -[REPORTS] - -# Set the output format. Available formats are text, parseable, colorized, msvs -# (visual studio) and html. You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages -reports=no - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - - -[BASIC] - -# Good variable names which should always be accepted, separated by a comma -good-names=main,_ - -# Bad variable names which should always be refused, separated by a comma -bad-names= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl - -# Regular expression matching correct function names -function-rgx=^(?:(?PsetUp|tearDown|setUpModule|tearDownModule)|(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ - -# Regular expression matching correct variable names -variable-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression matching correct constant names -const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ - -# Regular expression matching correct attribute names -attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ - -# Regular expression matching correct argument names -argument-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression matching correct class attribute names -class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ - -# Regular expression matching correct inline iteration names -inlinevar-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression matching correct class names -class-rgx=^_?[A-Z][a-zA-Z0-9]*$ - -# Regular expression matching correct module names -module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$ - -# Regular expression matching correct method names -method-rgx=(?x)^(?:(?P_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P_{0,2}[a-z][a-z0-9_]*))$ - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=10 - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - - -[FORMAT] - -# Maximum number of characters on a single line. -max-line-length=80 - -# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt -# lines made too long by directives to pytype. - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=(?x)( - ^\s*(\#\ )??$| - ^\s*(from\s+\S+\s+)?import\s+.+$) - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=yes - -# Maximum number of lines in a module -max-module-lines=99999 - -# String used as indentation unit. The internal Google style guide mandates 2 -# spaces. Google's externaly-published style guide says 4, consistent with -# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google -# projects (like TensorFlow). -indent-string=' ' - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=TODO - - -[STRING] - -# This flag controls whether inconsistent-quotes generates a warning when the -# character used as a quote delimiter is used inconsistently within a module. -check-quote-consistency=yes - - -[VARIABLES] - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging,absl.logging,tensorflow.io.logging - - -[SIMILARITIES] - -# Minimum lines number of a similarity. -min-similarity-lines=4 - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[IMPORTS] - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=regsub, - TERMIOS, - Bastion, - rexec, - sets - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant, absl - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls, - class_ - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=StandardError, - Exception, - BaseException diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 3d5530ecb315b..22c8112c40ab6 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -17,9 +17,8 @@ def sample_requests( tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int], ) -> List[Tuple[str, int, int]]: - if fixed_output_len is not None: - if fixed_output_len < 4: - raise ValueError("output_len too small") + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") # Load the dataset. with open(dataset_path) as f: diff --git a/format.sh b/format.sh index f4b63ecd66d88..251839893c97d 100755 --- a/format.sh +++ b/format.sh @@ -7,7 +7,7 @@ # # Format files that differ from origin/main. # bash format.sh -# # Commit changed files with message 'Run yapf and pylint' +# # Commit changed files with message 'Run yapf and ruff' # # # YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. @@ -22,7 +22,7 @@ ROOT="$(git rev-parse --show-toplevel)" builtin cd "$ROOT" || exit 1 YAPF_VERSION=$(yapf --version | awk '{print $2}') -PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}') +RUFF_VERSION=$(ruff --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}') # # params: tool name, tool version, required version @@ -34,7 +34,7 @@ tool_version_check() { } tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" -tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)" +tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" YAPF_FLAGS=( @@ -95,14 +95,14 @@ echo 'vLLM yapf: Done' # Lint specified files lint() { - pylint "$@" + ruff "$@" } # Lint files that differ from main branch. Ignores dirs that are not slated # for autolint yet. lint_changed() { # The `if` guard ensures that the list of filenames is not empty, which - # could cause pylint to receive 0 positional arguments, making it hang + # could cause ruff to receive 0 positional arguments, making it hang # waiting for STDIN. # # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that @@ -111,13 +111,13 @@ lint_changed() { if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - pylint + ruff fi } -# Run Pylint -echo 'vLLM Pylint:' +# Run Ruff +echo 'vLLM Ruff:' ## This flag lints individual files. --files *must* be the first command line ## arg to use this option. if [[ "$1" == '--files' ]]; then diff --git a/pyproject.toml b/pyproject.toml index 27285bb683565..e3e3e389f7897 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,3 +7,27 @@ requires = [ "wheel", ] build-backend = "setuptools.build_meta" + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + # "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + # "I", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # line too long, handled by black formatting + "E501", +] diff --git a/requirements-dev.txt b/requirements-dev.txt index b78976e8f4161..c9b212c923a42 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,6 @@ # formatting yapf==0.32.0 -pylint==2.8.2 +ruff==0.1.5 # type checking mypy==0.991 diff --git a/setup.py b/setup.py index 806a7192ac9c8..36f4913435628 100644 --- a/setup.py +++ b/setup.py @@ -75,7 +75,8 @@ def get_torch_arch_list() -> Set[str]: f"Unsupported CUDA architectures ({invalid_arch_list}) are " "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " f"({env_arch_list}). Supported CUDA architectures are: " - f"{valid_archs}.") + f"{valid_archs}.", + stacklevel=2) return arch_list @@ -106,10 +107,10 @@ if not compute_capabilities: # Validate the NVCC CUDA version. if nvcc_cuda_version < Version("11.0"): raise RuntimeError("CUDA 11.0 or higher is required to build the package.") -if nvcc_cuda_version < Version("11.1"): - if any(cc.startswith("8.6") for cc in compute_capabilities): - raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6.") +if (nvcc_cuda_version < Version("11.1") + and any(cc.startswith("8.6") for cc in compute_capabilities)): + raise RuntimeError( + "CUDA 11.1 or higher is required for compute capability 8.6.") if nvcc_cuda_version < Version("11.8"): if any(cc.startswith("8.9") for cc in compute_capabilities): # CUDA 11.8 is required to generate the code targeting compute capability 8.9. @@ -119,7 +120,8 @@ if nvcc_cuda_version < Version("11.8"): # instead of 8.9. warnings.warn( "CUDA 11.8 or higher is required for compute capability 8.9. " - "Targeting compute capability 8.0 instead.") + "Targeting compute capability 8.0 instead.", + stacklevel=2) compute_capabilities = set(cc for cc in compute_capabilities if not cc.startswith("8.9")) compute_capabilities.add("8.0+PTX") diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 515d7a801e9be..1be76fdc8d868 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -14,7 +14,6 @@ app = vllm.entrypoints.api_server.app class AsyncLLMEngineWithStats(AsyncLLMEngine): - # pylint: disable=redefined-outer-name def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._num_aborts = 0 diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 1ca4826b27f3b..d90ba37b27bb9 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -24,7 +24,6 @@ def _query_server(prompt: str) -> dict: def api_server(): script_path = Path(__file__).parent.joinpath( "api_server_async_engine.py").absolute() - # pylint: disable=consider-using-with uvicorn_process = subprocess.Popen([ sys.executable, "-u", str(script_path), "--model", "facebook/opt-125m" @@ -33,7 +32,6 @@ def api_server(): uvicorn_process.terminate() -# pylint: disable=redefined-outer-name, unused-argument def test_api_server(api_server): """ Run the API server and test it. @@ -49,11 +47,10 @@ def test_api_server(api_server): prompts = ["Hello world"] * 1 result = None while not result: - # pylint: disable=bare-except try: - for result in pool.map(_query_server, prompts): + for _ in pool.map(_query_server, prompts): break - except: + except Exception: time.sleep(1) # Actual tests start here diff --git a/tests/conftest.py b/tests/conftest.py index cc4339849f55a..9f0cf5bdc9899 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,6 @@ from vllm import LLM, SamplingParams from vllm.transformers_utils.tokenizer import get_tokenizer _TEST_PROMPTS = [ - # pylint: disable=line-too-long "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", "Compare and contrast artificial intelligence with human intelligence in terms of processing information.", diff --git a/tests/engine/test_detokenize.py b/tests/engine/test_detokenize.py index 0f51af166c4b1..4421739390e3b 100644 --- a/tests/engine/test_detokenize.py +++ b/tests/engine/test_detokenize.py @@ -5,10 +5,9 @@ from transformers import AutoTokenizer from vllm.transformers_utils.tokenizer import detokenize_incrementally TRUTH = [ - # pylint: disable=line-too-long - "Hello here, this is a simple test", - "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", - "我很感谢你的热情" + "Hello here, this is a simple test", # noqa: E501 + "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", # noqa: E501 + "我很感谢你的热情" # noqa: E501 ] TOKENIZERS = [ "facebook/opt-125m", diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 7c4a84d4c7d84..e76416d88311d 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -211,7 +211,7 @@ def test_paged_attention( alibi_slopes, ) else: - assert False, f"Unknown version: {version}" + raise AssertionError(f"Unknown version: {version}") # Run the reference implementation. ref_output = torch.empty_like(query) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index eec0d9ff79728..1df75cd7be14b 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,4 +1,3 @@ -# pylint: disable=protected-access import random from typing import Tuple from unittest.mock import patch @@ -20,10 +19,10 @@ class MockLogitsSampler(Sampler): def forward(self, *args, **kwargs): with patch("vllm.model_executor.layers.sampler._prune_hidden_states", - lambda x, y: x): - with patch("vllm.model_executor.layers.sampler._get_logits", + lambda x, y: x), patch( + "vllm.model_executor.layers.sampler._get_logits", lambda *args, **kwargs: self.fake_logits): - return super().forward(*args, **kwargs) + return super().forward(*args, **kwargs) def _prepare_test( @@ -214,6 +213,6 @@ def test_sampler_logits_processors(seed: int): sampler_output = sampler(embedding=None, hidden_states=input_tensor, input_metadata=input_metadata) - for i, sequence_output in enumerate(sampler_output): + for _, sequence_output in enumerate(sampler_output): for idx, nth_output in enumerate(sequence_output.samples): assert nth_output.output_token == idx diff --git a/tests/worker/test_worker.py b/tests/worker/test_worker.py index d7ed50acb2295..b2c61e24efdd6 100644 --- a/tests/worker/test_worker.py +++ b/tests/worker/test_worker.py @@ -1,4 +1,3 @@ -# pylint: disable=protected-access import random import torch diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 0c98c063c8694..b6cb91766eae0 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -350,7 +350,7 @@ class Scheduler: elif preemption_mode == PreemptionMode.SWAP: self._preempt_by_swap(seq_group, blocks_to_swap_out) else: - assert False, "Invalid preemption mode." + raise AssertionError("Invalid preemption mode.") def _preempt_by_recompute( self, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 20af3fb3e384b..ec7a4587ffe4b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -125,7 +125,7 @@ class LLMEngine: def _init_workers(self, distributed_init_method: str): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker # pylint: disable=import-outside-toplevel + from vllm.worker.worker import Worker assert self.parallel_config.world_size == 1, ( "Ray is required if parallel_config.world_size > 1.") @@ -148,7 +148,7 @@ class LLMEngine: **ray_remote_kwargs): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker # pylint: disable=import-outside-toplevel + from vllm.worker.worker import Worker self.workers: List[Worker] = [] for bundle in placement_group.bundle_specs: diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index ed7f1ec45e32d..ee58b8b9074a7 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -16,7 +16,6 @@ try: def __init__(self, init_cached_hf_modules=False) -> None: if init_cached_hf_modules: - # pylint: disable=import-outside-toplevel from transformers.dynamic_module_utils import init_hf_modules init_hf_modules() self.worker = None @@ -37,7 +36,7 @@ except ImportError as e: "`pip install ray pandas pyarrow`.") ray = None TorchDistributedWorker = None - RayWorker = None # pylint: disable=invalid-name + RayWorker = None if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9dddfc1acd9cc..b05ba71c6d352 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -134,25 +134,21 @@ class LLM: if isinstance(prompts, str): # Convert a single prompt to a list. prompts = [prompts] - if prompts is not None and prompt_token_ids is not None: - if len(prompts) != len(prompt_token_ids): - raise ValueError("The lengths of prompts and prompt_token_ids " - "must be the same.") + if (prompts is not None and prompt_token_ids is not None + and len(prompts) != len(prompt_token_ids)): + raise ValueError("The lengths of prompts and prompt_token_ids " + "must be the same.") if sampling_params is None: # Use default sampling params. sampling_params = SamplingParams() # Add requests to the engine. - if prompts is not None: - num_requests = len(prompts) - else: - num_requests = len(prompt_token_ids) + num_requests = len(prompts) if prompts is not None else len( + prompt_token_ids) for i in range(num_requests): prompt = prompts[i] if prompts is not None else None - if prompt_token_ids is None: - token_ids = None - else: - token_ids = prompt_token_ids[i] + token_ids = None if prompt_token_ids is None else prompt_token_ids[ + i] self._add_request(prompt, sampling_params, token_ids) return self._run_engine(use_tqdm) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index d3db5f47d1f3b..a9c9fbed0cbaa 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -55,7 +55,7 @@ def create_error_response(status_code: HTTPStatus, @app.exception_handler(RequestValidationError) -async def validation_exception_handler(request, exc): # pylint: disable=unused-argument +async def validation_exception_handler(_, exc): return create_error_response(HTTPStatus.BAD_REQUEST, str(exc)) @@ -124,10 +124,8 @@ async def check_length( assert (not (prompt is None and prompt_ids is None) and not (prompt is not None and prompt_ids is not None) ), "Either prompt or prompt_ids should be provided." - if prompt_ids is not None: - input_ids = prompt_ids - else: - input_ids = tokenizer(prompt).input_ids + input_ids = prompt_ids if prompt_ids is not None else tokenizer( + prompt).input_ids token_num = len(input_ids) if request.max_tokens is None: diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 2e774a1c7320f..2147b16b8a491 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -84,15 +84,14 @@ def get_act_fn( f"Activation function {act_fn_name!r} is not supported.") act_fn = _ACTIVATION_REGISTRY[act_fn_name] - if quant_config is not None: - if act_fn_name in quant_config.get_scaled_act_names(): - if intermediate_size is None: - raise ValueError( - "intermediate_size must be specified for scaled " - "activation functions.") - return ScaledActivation( - act_fn, - intermediate_size, - params_dtype=torch.get_default_dtype(), - ) + if quant_config is not None and act_fn_name in quant_config.get_scaled_act_names( + ): + if intermediate_size is None: + raise ValueError("intermediate_size must be specified for scaled " + "activation functions.") + return ScaledActivation( + act_fn, + intermediate_size, + params_dtype=torch.get_default_dtype(), + ) return act_fn diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index b94c82e132583..a0a1d36216498 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -18,7 +18,6 @@ _PARTITION_SIZE = 512 class PagedAttention(nn.Module): - # pylint: disable=line-too-long """GPT-style multi-head PagedAttention. This class takes query, key, and value tensors as input. The input tensors diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 44e572bdc12f5..0ab5819d930aa 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -50,7 +50,7 @@ class AWQConfig(QuantizationConfig): def get_config_filenames() -> List[str]: return [ "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq - "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq # pylint: disable=line-too-long + "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq ] @classmethod diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index 71a22c7771b2e..54b87c4b866e3 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -7,7 +7,7 @@ import torch.nn as nn from transformers import PretrainedConfig from vllm.config import ModelConfig -from vllm.model_executor.models import * # pylint: disable=wildcard-import +from vllm.model_executor.models import * from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights) diff --git a/vllm/model_executor/models/aquila.py b/vllm/model_executor/models/aquila.py index a1604bbba33b2..8372da562cf2e 100644 --- a/vllm/model_executor/models/aquila.py +++ b/vllm/model_executor/models/aquila.py @@ -261,10 +261,7 @@ class AquilaModel(nn.Module): ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) for i in range(len(self.layers)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] hidden_states = layer( positions, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index f86bd241eef04..93cbc1a8516a7 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -281,10 +281,7 @@ class BaiChuanModel(nn.Module): hidden_states = self.embed_tokens(input_ids) residual = None for i in range(len(self.layers)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] hidden_states, residual = layer( positions, diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 6a5f8c516f317..0eb3fdbb9ae3a 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -256,10 +256,7 @@ class BloomModel(nn.Module): hidden_states = self.word_embeddings(input_ids) hidden_states = self.word_embeddings_layernorm(hidden_states) for i in range(len(self.h)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.h[i] hidden_states = layer( position_ids, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 673ca2092146a..2a113a155aedd 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -269,10 +269,7 @@ class GLMTransformer(nn.Module): cache_events: Optional[List[torch.cuda.Event]], ) -> torch.Tensor: for i in range(self.num_layers): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] hidden_states = layer( hidden_states=hidden_states, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index f1b5d1da3601a..ceb7c651823e0 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -353,10 +353,7 @@ class FalconModel(nn.Module): ) -> torch.Tensor: hidden_states = self.word_embeddings(input_ids) for i in range(len(self.h)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.h[i] hidden_states = layer( positions, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 1de3d85e233ff..0f9f74d32ae3c 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -206,10 +206,7 @@ class GPT2Model(nn.Module): hidden_states = inputs_embeds + position_embeds for i in range(len(self.h)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.h[i] hidden_states = layer(hidden_states, kv_caches[i], input_metadata, cache_event) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index c2f9611c0fef2..47a5d7711e370 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -225,10 +225,7 @@ class GPTBigCodeModel(nn.Module): hidden_states = inputs_embeds + position_embeds for i in range(len(self.h)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.h[i] hidden_states = layer(hidden_states, kv_caches[i], input_metadata, cache_event) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index a5bb6f0fbefc5..9093d642a68fb 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -147,10 +147,7 @@ class GPTJBlock(nn.Module): linear_method: Optional[LinearMethodBase] = None, ): super().__init__() - if config.n_inner is None: - inner_dim = 4 * config.n_embd - else: - inner_dim = config.n_inner + inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.attn = GPTJAttention(config, linear_method) self.mlp = GPTJMLP(inner_dim, config, linear_method) @@ -205,10 +202,7 @@ class GPTJModel(nn.Module): ) -> torch.Tensor: hidden_states = self.wte(input_ids) for i in range(len(self.h)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.h[i] hidden_states = layer( position_ids, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 97ac5ca243557..8c0667d88d953 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -216,10 +216,7 @@ class GPTNeoXModel(nn.Module): ) -> torch.Tensor: hidden_states = self.embed_in(input_ids) for i in range(len(self.layers)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] hidden_states = layer( position_ids, diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py index 4621be732b897..13b2e70deeb86 100644 --- a/vllm/model_executor/models/internlm.py +++ b/vllm/model_executor/models/internlm.py @@ -213,10 +213,7 @@ class InternLMModel(nn.Module): hidden_states = self.embed_tokens(input_ids) residual = None for i in range(len(self.layers)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] hidden_states, residual = layer( positions, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2e02ef15fab28..c3192e8069703 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -253,10 +253,7 @@ class LlamaModel(nn.Module): hidden_states = self.embed_tokens(input_ids) residual = None for i in range(len(self.layers)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] hidden_states, residual = layer( positions, diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py index af3199af7844b..793e25b635978 100644 --- a/vllm/model_executor/models/mistral.py +++ b/vllm/model_executor/models/mistral.py @@ -248,10 +248,7 @@ class MistralModel(nn.Module): hidden_states = self.embed_tokens(input_ids) residual = None for i in range(len(self.layers)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] hidden_states, residual = layer( positions, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index c9cf16475ca21..47130649d3c6c 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -203,10 +203,10 @@ class MPTModel(nn.Module): self.norm_f = nn.LayerNorm(config.d_model) if config.no_bias: for module in self.modules(): - if hasattr(module, "bias"): - if isinstance(module.bias, nn.Parameter): - # Remove the bias term in Linear and LayerNorm. - module.register_parameter("bias", None) + if hasattr(module, "bias") and isinstance( + module.bias, nn.Parameter): + # Remove the bias term in Linear and LayerNorm. + module.register_parameter("bias", None) def forward( self, @@ -218,10 +218,7 @@ class MPTModel(nn.Module): ) -> torch.Tensor: hidden_states = self.wte(input_ids) for i in range(len(self.blocks)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] block = self.blocks[i] hidden_states = block( position_ids, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 2d1df29a59cf1..47b991e586029 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -257,10 +257,7 @@ class OPTDecoder(nn.Module): hidden_states = inputs_embeds + pos_embeds for i in range(len(self.layers)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] hidden_states = layer(hidden_states, kv_caches[i], input_metadata, cache_event) diff --git a/vllm/model_executor/models/phi_1_5.py b/vllm/model_executor/models/phi_1_5.py index fbf7aa0a1491e..18cd40f39a0af 100644 --- a/vllm/model_executor/models/phi_1_5.py +++ b/vllm/model_executor/models/phi_1_5.py @@ -258,10 +258,7 @@ class PhiModel(nn.Module): ) -> SamplerOutput: hidden_states = self.embd(input_ids) for i in range(self.config.num_hidden_layers): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.h[i] hidden_states = layer( positions, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 18e150368a357..ce13cae7ee002 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -213,10 +213,7 @@ class QWenModel(nn.Module): hidden_states = self.wte(input_ids) residual = None for i in range(len(self.h)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.h[i] hidden_states, residual = layer( positions, diff --git a/vllm/model_executor/models/yi.py b/vllm/model_executor/models/yi.py index 91e073773157c..8faa106f202f5 100644 --- a/vllm/model_executor/models/yi.py +++ b/vllm/model_executor/models/yi.py @@ -249,10 +249,7 @@ class YiModel(nn.Module): hidden_states = self.embed_tokens(input_ids) residual = None for i in range(len(self.layers)): - if cache_events is None: - cache_event = None - else: - cache_event = cache_events[i] + cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] hidden_states, residual = layer( positions, diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index ddad964f817df..8c17d9c635c73 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -131,11 +131,9 @@ def prepare_hf_model_weights( ) -> Tuple[str, List[str], bool]: # Download model weights from huggingface. is_local = os.path.isdir(model_name_or_path) - if use_safetensors: - allow_patterns = ["*.safetensors"] - else: - # Some quantized models use .pt files for storing the weights. - allow_patterns = ["*.bin", "*.pt"] + # Some quantized models use .pt files for storing the weights. + allow_patterns = ["*.safetensors" + ] if use_safetensors else ["*.bin", "*.pt"] if not is_local: # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. @@ -242,7 +240,7 @@ def hf_model_weights_iterator( elif use_safetensors: for st_file in hf_weights_files: with safe_open(st_file, framework="pt") as f: - for name in f.keys(): + for name in f: param = f.get_tensor(name) yield name, param else: diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 28db703ec8b71..8b16e559b24f2 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -2,7 +2,7 @@ from typing import Optional from transformers import AutoConfig, PretrainedConfig -from vllm.transformers_utils.configs import * # pylint: disable=wildcard-import +from vllm.transformers_utils.configs import * _CONFIG_REGISTRY = { "aquila": AquilaConfig, diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 711251da42fd3..5ea0d9122ef11 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -62,7 +62,6 @@ class MPTConfig(PretrainedConfig): fc_type: str = 'torch', verbose: Optional[int] = None, **kwargs: Any): - # pylint: disable=line-too-long """The MPT configuration class. Args: d_model (int): The size of the embedding dimension of the model. @@ -139,10 +138,10 @@ class MPTConfig(PretrainedConfig): self.init_config = init_config self.fc_type = fc_type if verbose is not None: - warnings.warn( - DeprecationWarning( - 'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' - )) + warnings.warn(DeprecationWarning( + 'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' + ), + stacklevel=2) if 'name' in kwargs: del kwargs['name'] if 'loss_fn' in kwargs: @@ -150,8 +149,8 @@ class MPTConfig(PretrainedConfig): if self.attn_config.get('alibi', False): self.learned_pos_emb = False warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`' - ) + f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', + stacklevel=2) super().__init__(**kwargs) self._validate_config() @@ -211,7 +210,8 @@ class MPTConfig(PretrainedConfig): ) if not self.learned_pos_emb and (not self.attn_config['alibi']): warnings.warn( - 'Positional information not being provided to the model.') + 'Positional information not being provided to the model.', + stacklevel=2) if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp': try: # pylint: disable=import-outside-toplevel diff --git a/vllm/utils.py b/vllm/utils.py index 0e17e90704892..34d3084856af8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -30,7 +30,7 @@ class Counter: def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html - cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 # pylint: disable=invalid-name + cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 max_shared_mem = cuda_utils.get_device_attribute( cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu) return int(max_shared_mem) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index bbbc2e7f45a6e..4fcd179ce85f2 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -350,10 +350,7 @@ class Worker: self.cache_engine.copy(blocks_to_copy) issued_cache_op = True - if issued_cache_op: - cache_events = self.cache_events - else: - cache_events = None + cache_events = self.cache_events if issued_cache_op else None # If there is no input, we don't need to execute the model. if not seq_group_metadata_list: