mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-22 10:17:52 +08:00
Migrate linter from pylint to ruff (#1665)
This commit is contained in:
parent
112627e8b2
commit
5ffc0d13a2
@ -1,4 +1,4 @@
|
|||||||
name: pylint
|
name: ruff
|
||||||
|
|
||||||
on:
|
on:
|
||||||
# Trigger the workflow on push or pull request,
|
# Trigger the workflow on push or pull request,
|
||||||
@ -11,7 +11,7 @@ on:
|
|||||||
- main
|
- main
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
pylint:
|
ruff:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
@ -25,7 +25,7 @@ jobs:
|
|||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install pylint==2.8.2
|
pip install ruff==0.1.5
|
||||||
- name: Analysing the code with pylint
|
- name: Analysing the code with ruff
|
||||||
run: |
|
run: |
|
||||||
pylint vllm tests
|
ruff vllm tests
|
||||||
434
.pylintrc
434
.pylintrc
@ -1,434 +0,0 @@
|
|||||||
# This Pylint rcfile contains a best-effort configuration to uphold the
|
|
||||||
# best-practices and style described in the Google Python style guide:
|
|
||||||
# https://google.github.io/styleguide/pyguide.html
|
|
||||||
#
|
|
||||||
# Its canonical open-source location is:
|
|
||||||
# https://google.github.io/styleguide/pylintrc
|
|
||||||
|
|
||||||
[MASTER]
|
|
||||||
|
|
||||||
# Files or directories to be skipped. They should be base names, not paths.
|
|
||||||
ignore=docs
|
|
||||||
|
|
||||||
# Files or directories matching the regex patterns are skipped. The regex
|
|
||||||
# matches against base names, not paths.
|
|
||||||
ignore-patterns=
|
|
||||||
|
|
||||||
# Pickle collected data for later comparisons.
|
|
||||||
persistent=no
|
|
||||||
|
|
||||||
# List of plugins (as comma separated values of python modules names) to load,
|
|
||||||
# usually to register additional checkers.
|
|
||||||
load-plugins=
|
|
||||||
|
|
||||||
# Use multiple processes to speed up Pylint.
|
|
||||||
jobs=4
|
|
||||||
|
|
||||||
# Allow loading of arbitrary C extensions. Extensions are imported into the
|
|
||||||
# active Python interpreter and may run arbitrary code.
|
|
||||||
unsafe-load-any-extension=no
|
|
||||||
|
|
||||||
|
|
||||||
[MESSAGES CONTROL]
|
|
||||||
|
|
||||||
# Only show warnings with the listed confidence levels. Leave empty to show
|
|
||||||
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
|
|
||||||
confidence=
|
|
||||||
|
|
||||||
# Enable the message, report, category or checker with the given id(s). You can
|
|
||||||
# either give multiple identifiers separated by comma (,) or put this option
|
|
||||||
# multiple times (only on the command line, not in the configuration file where
|
|
||||||
# it should appear only once). See also the "--disable" option for examples.
|
|
||||||
#enable=
|
|
||||||
|
|
||||||
# Disable the message, report, category or checker with the given id(s). You
|
|
||||||
# can either give multiple identifiers separated by comma (,) or put this
|
|
||||||
# option multiple times (only on the command line, not in the configuration
|
|
||||||
# file where it should appear only once). You can also use "--disable=all" to
|
|
||||||
# disable everything first and then reenable specific checks. For example, if
|
|
||||||
# you want to run only the similarities checker, you can use "--disable=all
|
|
||||||
# --enable=similarities". If you want to run only the classes checker, but have
|
|
||||||
# no Warning level messages displayed, use "--disable=all --enable=classes
|
|
||||||
# --disable=W"
|
|
||||||
disable=abstract-method,
|
|
||||||
apply-builtin,
|
|
||||||
arguments-differ,
|
|
||||||
attribute-defined-outside-init,
|
|
||||||
backtick,
|
|
||||||
bad-option-value,
|
|
||||||
basestring-builtin,
|
|
||||||
buffer-builtin,
|
|
||||||
c-extension-no-member,
|
|
||||||
consider-using-enumerate,
|
|
||||||
cmp-builtin,
|
|
||||||
cmp-method,
|
|
||||||
coerce-builtin,
|
|
||||||
coerce-method,
|
|
||||||
delslice-method,
|
|
||||||
div-method,
|
|
||||||
duplicate-code,
|
|
||||||
eq-without-hash,
|
|
||||||
execfile-builtin,
|
|
||||||
file-builtin,
|
|
||||||
filter-builtin-not-iterating,
|
|
||||||
fixme,
|
|
||||||
getslice-method,
|
|
||||||
global-statement,
|
|
||||||
hex-method,
|
|
||||||
idiv-method,
|
|
||||||
implicit-str-concat-in-sequence,
|
|
||||||
import-error,
|
|
||||||
import-self,
|
|
||||||
import-star-module-level,
|
|
||||||
inconsistent-return-statements,
|
|
||||||
input-builtin,
|
|
||||||
intern-builtin,
|
|
||||||
invalid-str-codec,
|
|
||||||
locally-disabled,
|
|
||||||
logging-fstring-interpolation, # added by vLLM
|
|
||||||
logging-not-lazy, # added by vLLM
|
|
||||||
long-builtin,
|
|
||||||
long-suffix,
|
|
||||||
map-builtin-not-iterating,
|
|
||||||
misplaced-comparison-constant,
|
|
||||||
missing-class-docstring, # TODO (vLLM): enable
|
|
||||||
missing-function-docstring,
|
|
||||||
missing-module-docstring, # TODO (vLLM): enable
|
|
||||||
metaclass-assignment,
|
|
||||||
next-method-called,
|
|
||||||
next-method-defined,
|
|
||||||
no-absolute-import,
|
|
||||||
no-else-break,
|
|
||||||
no-else-continue,
|
|
||||||
no-else-raise,
|
|
||||||
no-else-return,
|
|
||||||
no-init, # added
|
|
||||||
no-member,
|
|
||||||
no-name-in-module,
|
|
||||||
no-self-use,
|
|
||||||
nonzero-method,
|
|
||||||
oct-method,
|
|
||||||
old-division,
|
|
||||||
old-ne-operator,
|
|
||||||
old-octal-literal,
|
|
||||||
old-raise-syntax,
|
|
||||||
parameter-unpacking,
|
|
||||||
print-statement,
|
|
||||||
raising-string,
|
|
||||||
range-builtin-not-iterating,
|
|
||||||
raw_input-builtin,
|
|
||||||
rdiv-method,
|
|
||||||
reduce-builtin,
|
|
||||||
relative-import,
|
|
||||||
reload-builtin,
|
|
||||||
round-builtin,
|
|
||||||
setslice-method,
|
|
||||||
signature-differs,
|
|
||||||
standarderror-builtin,
|
|
||||||
suppressed-message,
|
|
||||||
sys-max-int,
|
|
||||||
too-few-public-methods,
|
|
||||||
too-many-ancestors,
|
|
||||||
too-many-arguments,
|
|
||||||
too-many-boolean-expressions,
|
|
||||||
too-many-branches,
|
|
||||||
too-many-instance-attributes,
|
|
||||||
too-many-locals,
|
|
||||||
too-many-nested-blocks,
|
|
||||||
too-many-public-methods,
|
|
||||||
too-many-return-statements,
|
|
||||||
too-many-statements,
|
|
||||||
trailing-newlines,
|
|
||||||
unichr-builtin,
|
|
||||||
unicode-builtin,
|
|
||||||
unnecessary-pass,
|
|
||||||
unpacking-in-except,
|
|
||||||
unspecified-encoding,
|
|
||||||
useless-else-on-loop,
|
|
||||||
useless-object-inheritance,
|
|
||||||
useless-suppression,
|
|
||||||
using-cmp-argument,
|
|
||||||
wrong-import-order,
|
|
||||||
xrange-builtin,
|
|
||||||
zip-builtin-not-iterating,
|
|
||||||
|
|
||||||
|
|
||||||
[REPORTS]
|
|
||||||
|
|
||||||
# Set the output format. Available formats are text, parseable, colorized, msvs
|
|
||||||
# (visual studio) and html. You can also give a reporter class, eg
|
|
||||||
# mypackage.mymodule.MyReporterClass.
|
|
||||||
output-format=text
|
|
||||||
|
|
||||||
# Tells whether to display a full report or only the messages
|
|
||||||
reports=no
|
|
||||||
|
|
||||||
# Python expression which should return a note less than 10 (10 is the highest
|
|
||||||
# note). You have access to the variables errors warning, statement which
|
|
||||||
# respectively contain the number of errors / warnings messages and the total
|
|
||||||
# number of statements analyzed. This is used by the global evaluation report
|
|
||||||
# (RP0004).
|
|
||||||
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
|
|
||||||
|
|
||||||
# Template used to display messages. This is a python new-style format string
|
|
||||||
# used to format the message information. See doc for all details
|
|
||||||
#msg-template=
|
|
||||||
|
|
||||||
|
|
||||||
[BASIC]
|
|
||||||
|
|
||||||
# Good variable names which should always be accepted, separated by a comma
|
|
||||||
good-names=main,_
|
|
||||||
|
|
||||||
# Bad variable names which should always be refused, separated by a comma
|
|
||||||
bad-names=
|
|
||||||
|
|
||||||
# Colon-delimited sets of names that determine each other's naming style when
|
|
||||||
# the name regexes allow several styles.
|
|
||||||
name-group=
|
|
||||||
|
|
||||||
# Include a hint for the correct naming format with invalid-name
|
|
||||||
include-naming-hint=no
|
|
||||||
|
|
||||||
# List of decorators that produce properties, such as abc.abstractproperty. Add
|
|
||||||
# to this list to register other decorators that produce valid properties.
|
|
||||||
property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
|
|
||||||
|
|
||||||
# Regular expression matching correct function names
|
|
||||||
function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
|
|
||||||
|
|
||||||
# Regular expression matching correct variable names
|
|
||||||
variable-rgx=^[a-z][a-z0-9_]*$
|
|
||||||
|
|
||||||
# Regular expression matching correct constant names
|
|
||||||
const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
|
|
||||||
|
|
||||||
# Regular expression matching correct attribute names
|
|
||||||
attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
|
|
||||||
|
|
||||||
# Regular expression matching correct argument names
|
|
||||||
argument-rgx=^[a-z][a-z0-9_]*$
|
|
||||||
|
|
||||||
# Regular expression matching correct class attribute names
|
|
||||||
class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
|
|
||||||
|
|
||||||
# Regular expression matching correct inline iteration names
|
|
||||||
inlinevar-rgx=^[a-z][a-z0-9_]*$
|
|
||||||
|
|
||||||
# Regular expression matching correct class names
|
|
||||||
class-rgx=^_?[A-Z][a-zA-Z0-9]*$
|
|
||||||
|
|
||||||
# Regular expression matching correct module names
|
|
||||||
module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
|
|
||||||
|
|
||||||
# Regular expression matching correct method names
|
|
||||||
method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
|
|
||||||
|
|
||||||
# Regular expression which should only match function or class names that do
|
|
||||||
# not require a docstring.
|
|
||||||
no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
|
|
||||||
|
|
||||||
# Minimum line length for functions/classes that require docstrings, shorter
|
|
||||||
# ones are exempt.
|
|
||||||
docstring-min-length=10
|
|
||||||
|
|
||||||
|
|
||||||
[TYPECHECK]
|
|
||||||
|
|
||||||
# List of decorators that produce context managers, such as
|
|
||||||
# contextlib.contextmanager. Add to this list to register other decorators that
|
|
||||||
# produce valid context managers.
|
|
||||||
contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
|
|
||||||
|
|
||||||
# Tells whether missing members accessed in mixin class should be ignored. A
|
|
||||||
# mixin class is detected if its name ends with "mixin" (case insensitive).
|
|
||||||
ignore-mixin-members=yes
|
|
||||||
|
|
||||||
# List of module names for which member attributes should not be checked
|
|
||||||
# (useful for modules/projects where namespaces are manipulated during runtime
|
|
||||||
# and thus existing member attributes cannot be deduced by static analysis. It
|
|
||||||
# supports qualified module names, as well as Unix pattern matching.
|
|
||||||
ignored-modules=
|
|
||||||
|
|
||||||
# List of class names for which member attributes should not be checked (useful
|
|
||||||
# for classes with dynamically set attributes). This supports the use of
|
|
||||||
# qualified names.
|
|
||||||
ignored-classes=optparse.Values,thread._local,_thread._local
|
|
||||||
|
|
||||||
# List of members which are set dynamically and missed by pylint inference
|
|
||||||
# system, and so shouldn't trigger E1101 when accessed. Python regular
|
|
||||||
# expressions are accepted.
|
|
||||||
generated-members=
|
|
||||||
|
|
||||||
|
|
||||||
[FORMAT]
|
|
||||||
|
|
||||||
# Maximum number of characters on a single line.
|
|
||||||
max-line-length=80
|
|
||||||
|
|
||||||
# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
|
|
||||||
# lines made too long by directives to pytype.
|
|
||||||
|
|
||||||
# Regexp for a line that is allowed to be longer than the limit.
|
|
||||||
ignore-long-lines=(?x)(
|
|
||||||
^\s*(\#\ )?<?https?://\S+>?$|
|
|
||||||
^\s*(from\s+\S+\s+)?import\s+.+$)
|
|
||||||
|
|
||||||
# Allow the body of an if to be on the same line as the test if there is no
|
|
||||||
# else.
|
|
||||||
single-line-if-stmt=yes
|
|
||||||
|
|
||||||
# Maximum number of lines in a module
|
|
||||||
max-module-lines=99999
|
|
||||||
|
|
||||||
# String used as indentation unit. The internal Google style guide mandates 2
|
|
||||||
# spaces. Google's externally-published style guide says 4, consistent with
|
|
||||||
# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
|
|
||||||
# projects (like TensorFlow).
|
|
||||||
indent-string=' '
|
|
||||||
|
|
||||||
# Number of spaces of indent required inside a hanging or continued line.
|
|
||||||
indent-after-paren=4
|
|
||||||
|
|
||||||
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
|
|
||||||
expected-line-ending-format=
|
|
||||||
|
|
||||||
|
|
||||||
[MISCELLANEOUS]
|
|
||||||
|
|
||||||
# List of note tags to take in consideration, separated by a comma.
|
|
||||||
notes=TODO
|
|
||||||
|
|
||||||
|
|
||||||
[STRING]
|
|
||||||
|
|
||||||
# This flag controls whether inconsistent-quotes generates a warning when the
|
|
||||||
# character used as a quote delimiter is used inconsistently within a module.
|
|
||||||
check-quote-consistency=yes
|
|
||||||
|
|
||||||
|
|
||||||
[VARIABLES]
|
|
||||||
|
|
||||||
# Tells whether we should check for unused import in __init__ files.
|
|
||||||
init-import=no
|
|
||||||
|
|
||||||
# A regular expression matching the name of dummy variables (i.e. expectedly
|
|
||||||
# not used).
|
|
||||||
dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
|
|
||||||
|
|
||||||
# List of additional names supposed to be defined in builtins. Remember that
|
|
||||||
# you should avoid to define new builtins when possible.
|
|
||||||
additional-builtins=
|
|
||||||
|
|
||||||
# List of strings which can identify a callback function by name. A callback
|
|
||||||
# name must start or end with one of those strings.
|
|
||||||
callbacks=cb_,_cb
|
|
||||||
|
|
||||||
# List of qualified module names which can have objects that can redefine
|
|
||||||
# builtins.
|
|
||||||
redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
|
|
||||||
|
|
||||||
|
|
||||||
[LOGGING]
|
|
||||||
|
|
||||||
# Logging modules to check that the string format arguments are in logging
|
|
||||||
# function parameter format
|
|
||||||
logging-modules=logging,absl.logging,tensorflow.io.logging
|
|
||||||
|
|
||||||
|
|
||||||
[SIMILARITIES]
|
|
||||||
|
|
||||||
# Minimum lines number of a similarity.
|
|
||||||
min-similarity-lines=4
|
|
||||||
|
|
||||||
# Ignore comments when computing similarities.
|
|
||||||
ignore-comments=yes
|
|
||||||
|
|
||||||
# Ignore docstrings when computing similarities.
|
|
||||||
ignore-docstrings=yes
|
|
||||||
|
|
||||||
# Ignore imports when computing similarities.
|
|
||||||
ignore-imports=no
|
|
||||||
|
|
||||||
|
|
||||||
[SPELLING]
|
|
||||||
|
|
||||||
# Spelling dictionary name. Available dictionaries: none. To make it working
|
|
||||||
# install python-enchant package.
|
|
||||||
spelling-dict=
|
|
||||||
|
|
||||||
# List of comma separated words that should not be checked.
|
|
||||||
spelling-ignore-words=
|
|
||||||
|
|
||||||
# A path to a file that contains private dictionary; one word per line.
|
|
||||||
spelling-private-dict-file=
|
|
||||||
|
|
||||||
# Tells whether to store unknown words to indicated private dictionary in
|
|
||||||
# --spelling-private-dict-file option instead of raising a message.
|
|
||||||
spelling-store-unknown-words=no
|
|
||||||
|
|
||||||
|
|
||||||
[IMPORTS]
|
|
||||||
|
|
||||||
# Deprecated modules which should not be used, separated by a comma
|
|
||||||
deprecated-modules=regsub,
|
|
||||||
TERMIOS,
|
|
||||||
Bastion,
|
|
||||||
rexec,
|
|
||||||
sets
|
|
||||||
|
|
||||||
# Create a graph of every (i.e. internal and external) dependencies in the
|
|
||||||
# given file (report RP0402 must not be disabled)
|
|
||||||
import-graph=
|
|
||||||
|
|
||||||
# Create a graph of external dependencies in the given file (report RP0402 must
|
|
||||||
# not be disabled)
|
|
||||||
ext-import-graph=
|
|
||||||
|
|
||||||
# Create a graph of internal dependencies in the given file (report RP0402 must
|
|
||||||
# not be disabled)
|
|
||||||
int-import-graph=
|
|
||||||
|
|
||||||
# Force import order to recognize a module as part of the standard
|
|
||||||
# compatibility libraries.
|
|
||||||
known-standard-library=
|
|
||||||
|
|
||||||
# Force import order to recognize a module as part of a third party library.
|
|
||||||
known-third-party=enchant, absl
|
|
||||||
|
|
||||||
# Analyse import fallback blocks. This can be used to support both Python 2 and
|
|
||||||
# 3 compatible code, which means that the block might have code that exists
|
|
||||||
# only in one or another interpreter, leading to false positives when analysed.
|
|
||||||
analyse-fallback-blocks=no
|
|
||||||
|
|
||||||
|
|
||||||
[CLASSES]
|
|
||||||
|
|
||||||
# List of method names used to declare (i.e. assign) instance attributes.
|
|
||||||
defining-attr-methods=__init__,
|
|
||||||
__new__,
|
|
||||||
setUp
|
|
||||||
|
|
||||||
# List of member names, which should be excluded from the protected access
|
|
||||||
# warning.
|
|
||||||
exclude-protected=_asdict,
|
|
||||||
_fields,
|
|
||||||
_replace,
|
|
||||||
_source,
|
|
||||||
_make
|
|
||||||
|
|
||||||
# List of valid names for the first argument in a class method.
|
|
||||||
valid-classmethod-first-arg=cls,
|
|
||||||
class_
|
|
||||||
|
|
||||||
# List of valid names for the first argument in a metaclass class method.
|
|
||||||
valid-metaclass-classmethod-first-arg=mcs
|
|
||||||
|
|
||||||
|
|
||||||
[EXCEPTIONS]
|
|
||||||
|
|
||||||
# Exceptions that will emit a warning when being caught. Defaults to
|
|
||||||
# "Exception"
|
|
||||||
overgeneral-exceptions=StandardError,
|
|
||||||
Exception,
|
|
||||||
BaseException
|
|
||||||
@ -17,9 +17,8 @@ def sample_requests(
|
|||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
fixed_output_len: Optional[int],
|
fixed_output_len: Optional[int],
|
||||||
) -> List[Tuple[str, int, int]]:
|
) -> List[Tuple[str, int, int]]:
|
||||||
if fixed_output_len is not None:
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
if fixed_output_len < 4:
|
raise ValueError("output_len too small")
|
||||||
raise ValueError("output_len too small")
|
|
||||||
|
|
||||||
# Load the dataset.
|
# Load the dataset.
|
||||||
with open(dataset_path) as f:
|
with open(dataset_path) as f:
|
||||||
|
|||||||
16
format.sh
16
format.sh
@ -7,7 +7,7 @@
|
|||||||
# # Format files that differ from origin/main.
|
# # Format files that differ from origin/main.
|
||||||
# bash format.sh
|
# bash format.sh
|
||||||
|
|
||||||
# # Commit changed files with message 'Run yapf and pylint'
|
# # Commit changed files with message 'Run yapf and ruff'
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
|
# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
|
||||||
@ -22,7 +22,7 @@ ROOT="$(git rev-parse --show-toplevel)"
|
|||||||
builtin cd "$ROOT" || exit 1
|
builtin cd "$ROOT" || exit 1
|
||||||
|
|
||||||
YAPF_VERSION=$(yapf --version | awk '{print $2}')
|
YAPF_VERSION=$(yapf --version | awk '{print $2}')
|
||||||
PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}')
|
RUFF_VERSION=$(ruff --version | awk '{print $2}')
|
||||||
MYPY_VERSION=$(mypy --version | awk '{print $2}')
|
MYPY_VERSION=$(mypy --version | awk '{print $2}')
|
||||||
|
|
||||||
# # params: tool name, tool version, required version
|
# # params: tool name, tool version, required version
|
||||||
@ -34,7 +34,7 @@ tool_version_check() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
|
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
|
||||||
tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)"
|
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
|
||||||
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
|
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
|
||||||
|
|
||||||
YAPF_FLAGS=(
|
YAPF_FLAGS=(
|
||||||
@ -95,14 +95,14 @@ echo 'vLLM yapf: Done'
|
|||||||
|
|
||||||
# Lint specified files
|
# Lint specified files
|
||||||
lint() {
|
lint() {
|
||||||
pylint "$@"
|
ruff "$@"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Lint files that differ from main branch. Ignores dirs that are not slated
|
# Lint files that differ from main branch. Ignores dirs that are not slated
|
||||||
# for autolint yet.
|
# for autolint yet.
|
||||||
lint_changed() {
|
lint_changed() {
|
||||||
# The `if` guard ensures that the list of filenames is not empty, which
|
# The `if` guard ensures that the list of filenames is not empty, which
|
||||||
# could cause pylint to receive 0 positional arguments, making it hang
|
# could cause ruff to receive 0 positional arguments, making it hang
|
||||||
# waiting for STDIN.
|
# waiting for STDIN.
|
||||||
#
|
#
|
||||||
# `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
|
# `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
|
||||||
@ -111,13 +111,13 @@ lint_changed() {
|
|||||||
|
|
||||||
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
|
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
|
||||||
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
|
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
|
||||||
pylint
|
ruff
|
||||||
fi
|
fi
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Run Pylint
|
# Run Ruff
|
||||||
echo 'vLLM Pylint:'
|
echo 'vLLM Ruff:'
|
||||||
## This flag lints individual files. --files *must* be the first command line
|
## This flag lints individual files. --files *must* be the first command line
|
||||||
## arg to use this option.
|
## arg to use this option.
|
||||||
if [[ "$1" == '--files' ]]; then
|
if [[ "$1" == '--files' ]]; then
|
||||||
|
|||||||
@ -7,3 +7,27 @@ requires = [
|
|||||||
"wheel",
|
"wheel",
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = [
|
||||||
|
# pycodestyle
|
||||||
|
"E",
|
||||||
|
# Pyflakes
|
||||||
|
"F",
|
||||||
|
# pyupgrade
|
||||||
|
# "UP",
|
||||||
|
# flake8-bugbear
|
||||||
|
"B",
|
||||||
|
# flake8-simplify
|
||||||
|
"SIM",
|
||||||
|
# isort
|
||||||
|
# "I",
|
||||||
|
]
|
||||||
|
ignore = [
|
||||||
|
# star imports
|
||||||
|
"F405", "F403",
|
||||||
|
# lambda expression assignment
|
||||||
|
"E731",
|
||||||
|
# line too long, handled by yapf formatting
|
||||||
|
"E501",
|
||||||
|
]
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
# formatting
|
# formatting
|
||||||
yapf==0.32.0
|
yapf==0.32.0
|
||||||
pylint==2.8.2
|
ruff==0.1.5
|
||||||
|
|
||||||
# type checking
|
# type checking
|
||||||
mypy==0.991
|
mypy==0.991
|
||||||
|
|||||||
14
setup.py
14
setup.py
@ -75,7 +75,8 @@ def get_torch_arch_list() -> Set[str]:
|
|||||||
f"Unsupported CUDA architectures ({invalid_arch_list}) are "
|
f"Unsupported CUDA architectures ({invalid_arch_list}) are "
|
||||||
"excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
|
"excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
|
||||||
f"({env_arch_list}). Supported CUDA architectures are: "
|
f"({env_arch_list}). Supported CUDA architectures are: "
|
||||||
f"{valid_archs}.")
|
f"{valid_archs}.",
|
||||||
|
stacklevel=2)
|
||||||
return arch_list
|
return arch_list
|
||||||
|
|
||||||
|
|
||||||
@ -106,10 +107,10 @@ if not compute_capabilities:
|
|||||||
# Validate the NVCC CUDA version.
|
# Validate the NVCC CUDA version.
|
||||||
if nvcc_cuda_version < Version("11.0"):
|
if nvcc_cuda_version < Version("11.0"):
|
||||||
raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
|
raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
|
||||||
if nvcc_cuda_version < Version("11.1"):
|
if (nvcc_cuda_version < Version("11.1")
|
||||||
if any(cc.startswith("8.6") for cc in compute_capabilities):
|
and any(cc.startswith("8.6") for cc in compute_capabilities)):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"CUDA 11.1 or higher is required for compute capability 8.6.")
|
"CUDA 11.1 or higher is required for compute capability 8.6.")
|
||||||
if nvcc_cuda_version < Version("11.8"):
|
if nvcc_cuda_version < Version("11.8"):
|
||||||
if any(cc.startswith("8.9") for cc in compute_capabilities):
|
if any(cc.startswith("8.9") for cc in compute_capabilities):
|
||||||
# CUDA 11.8 is required to generate the code targeting compute capability 8.9.
|
# CUDA 11.8 is required to generate the code targeting compute capability 8.9.
|
||||||
@ -119,7 +120,8 @@ if nvcc_cuda_version < Version("11.8"):
|
|||||||
# instead of 8.9.
|
# instead of 8.9.
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"CUDA 11.8 or higher is required for compute capability 8.9. "
|
"CUDA 11.8 or higher is required for compute capability 8.9. "
|
||||||
"Targeting compute capability 8.0 instead.")
|
"Targeting compute capability 8.0 instead.",
|
||||||
|
stacklevel=2)
|
||||||
compute_capabilities = set(cc for cc in compute_capabilities
|
compute_capabilities = set(cc for cc in compute_capabilities
|
||||||
if not cc.startswith("8.9"))
|
if not cc.startswith("8.9"))
|
||||||
compute_capabilities.add("8.0+PTX")
|
compute_capabilities.add("8.0+PTX")
|
||||||
|
|||||||
@ -14,7 +14,6 @@ app = vllm.entrypoints.api_server.app
|
|||||||
|
|
||||||
class AsyncLLMEngineWithStats(AsyncLLMEngine):
|
class AsyncLLMEngineWithStats(AsyncLLMEngine):
|
||||||
|
|
||||||
# pylint: disable=redefined-outer-name
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self._num_aborts = 0
|
self._num_aborts = 0
|
||||||
|
|||||||
@ -24,7 +24,6 @@ def _query_server(prompt: str) -> dict:
|
|||||||
def api_server():
|
def api_server():
|
||||||
script_path = Path(__file__).parent.joinpath(
|
script_path = Path(__file__).parent.joinpath(
|
||||||
"api_server_async_engine.py").absolute()
|
"api_server_async_engine.py").absolute()
|
||||||
# pylint: disable=consider-using-with
|
|
||||||
uvicorn_process = subprocess.Popen([
|
uvicorn_process = subprocess.Popen([
|
||||||
sys.executable, "-u",
|
sys.executable, "-u",
|
||||||
str(script_path), "--model", "facebook/opt-125m"
|
str(script_path), "--model", "facebook/opt-125m"
|
||||||
@ -33,7 +32,6 @@ def api_server():
|
|||||||
uvicorn_process.terminate()
|
uvicorn_process.terminate()
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=redefined-outer-name, unused-argument
|
|
||||||
def test_api_server(api_server):
|
def test_api_server(api_server):
|
||||||
"""
|
"""
|
||||||
Run the API server and test it.
|
Run the API server and test it.
|
||||||
@ -49,11 +47,10 @@ def test_api_server(api_server):
|
|||||||
prompts = ["Hello world"] * 1
|
prompts = ["Hello world"] * 1
|
||||||
result = None
|
result = None
|
||||||
while not result:
|
while not result:
|
||||||
# pylint: disable=bare-except
|
|
||||||
try:
|
try:
|
||||||
for result in pool.map(_query_server, prompts):
|
for _ in pool.map(_query_server, prompts):
|
||||||
break
|
break
|
||||||
except:
|
except Exception:
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
# Actual tests start here
|
# Actual tests start here
|
||||||
|
|||||||
@ -8,7 +8,6 @@ from vllm import LLM, SamplingParams
|
|||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
|
||||||
_TEST_PROMPTS = [
|
_TEST_PROMPTS = [
|
||||||
# pylint: disable=line-too-long
|
|
||||||
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
|
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
|
||||||
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
|
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
|
||||||
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
|
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
|
||||||
|
|||||||
@ -5,10 +5,9 @@ from transformers import AutoTokenizer
|
|||||||
from vllm.transformers_utils.tokenizer import detokenize_incrementally
|
from vllm.transformers_utils.tokenizer import detokenize_incrementally
|
||||||
|
|
||||||
TRUTH = [
|
TRUTH = [
|
||||||
# pylint: disable=line-too-long
|
"Hello here, this is a simple test", # noqa: E501
|
||||||
"Hello here, this is a simple test",
|
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", # noqa: E501
|
||||||
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",
|
"我很感谢你的热情" # noqa: E501
|
||||||
"我很感谢你的热情"
|
|
||||||
]
|
]
|
||||||
TOKENIZERS = [
|
TOKENIZERS = [
|
||||||
"facebook/opt-125m",
|
"facebook/opt-125m",
|
||||||
|
|||||||
@ -211,7 +211,7 @@ def test_paged_attention(
|
|||||||
alibi_slopes,
|
alibi_slopes,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
assert False, f"Unknown version: {version}"
|
raise AssertionError(f"Unknown version: {version}")
|
||||||
|
|
||||||
# Run the reference implementation.
|
# Run the reference implementation.
|
||||||
ref_output = torch.empty_like(query)
|
ref_output = torch.empty_like(query)
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# pylint: disable=protected-access
|
|
||||||
import random
|
import random
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
@ -20,10 +19,10 @@ class MockLogitsSampler(Sampler):
|
|||||||
|
|
||||||
def forward(self, *args, **kwargs):
|
def forward(self, *args, **kwargs):
|
||||||
with patch("vllm.model_executor.layers.sampler._prune_hidden_states",
|
with patch("vllm.model_executor.layers.sampler._prune_hidden_states",
|
||||||
lambda x, y: x):
|
lambda x, y: x), patch(
|
||||||
with patch("vllm.model_executor.layers.sampler._get_logits",
|
"vllm.model_executor.layers.sampler._get_logits",
|
||||||
lambda *args, **kwargs: self.fake_logits):
|
lambda *args, **kwargs: self.fake_logits):
|
||||||
return super().forward(*args, **kwargs)
|
return super().forward(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def _prepare_test(
|
def _prepare_test(
|
||||||
@ -214,6 +213,6 @@ def test_sampler_logits_processors(seed: int):
|
|||||||
sampler_output = sampler(embedding=None,
|
sampler_output = sampler(embedding=None,
|
||||||
hidden_states=input_tensor,
|
hidden_states=input_tensor,
|
||||||
input_metadata=input_metadata)
|
input_metadata=input_metadata)
|
||||||
for i, sequence_output in enumerate(sampler_output):
|
for _, sequence_output in enumerate(sampler_output):
|
||||||
for idx, nth_output in enumerate(sequence_output.samples):
|
for idx, nth_output in enumerate(sequence_output.samples):
|
||||||
assert nth_output.output_token == idx
|
assert nth_output.output_token == idx
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
# pylint: disable=protected-access
|
|
||||||
import random
|
import random
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|||||||
@ -350,7 +350,7 @@ class Scheduler:
|
|||||||
elif preemption_mode == PreemptionMode.SWAP:
|
elif preemption_mode == PreemptionMode.SWAP:
|
||||||
self._preempt_by_swap(seq_group, blocks_to_swap_out)
|
self._preempt_by_swap(seq_group, blocks_to_swap_out)
|
||||||
else:
|
else:
|
||||||
assert False, "Invalid preemption mode."
|
raise AssertionError("Invalid preemption mode.")
|
||||||
|
|
||||||
def _preempt_by_recompute(
|
def _preempt_by_recompute(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -125,7 +125,7 @@ class LLMEngine:
|
|||||||
def _init_workers(self, distributed_init_method: str):
|
def _init_workers(self, distributed_init_method: str):
|
||||||
# Lazy import the Worker to avoid importing torch.cuda/xformers
|
# Lazy import the Worker to avoid importing torch.cuda/xformers
|
||||||
# before CUDA_VISIBLE_DEVICES is set in the Worker
|
# before CUDA_VISIBLE_DEVICES is set in the Worker
|
||||||
from vllm.worker.worker import Worker # pylint: disable=import-outside-toplevel
|
from vllm.worker.worker import Worker
|
||||||
|
|
||||||
assert self.parallel_config.world_size == 1, (
|
assert self.parallel_config.world_size == 1, (
|
||||||
"Ray is required if parallel_config.world_size > 1.")
|
"Ray is required if parallel_config.world_size > 1.")
|
||||||
@ -148,7 +148,7 @@ class LLMEngine:
|
|||||||
**ray_remote_kwargs):
|
**ray_remote_kwargs):
|
||||||
# Lazy import the Worker to avoid importing torch.cuda/xformers
|
# Lazy import the Worker to avoid importing torch.cuda/xformers
|
||||||
# before CUDA_VISIBLE_DEVICES is set in the Worker
|
# before CUDA_VISIBLE_DEVICES is set in the Worker
|
||||||
from vllm.worker.worker import Worker # pylint: disable=import-outside-toplevel
|
from vllm.worker.worker import Worker
|
||||||
|
|
||||||
self.workers: List[Worker] = []
|
self.workers: List[Worker] = []
|
||||||
for bundle in placement_group.bundle_specs:
|
for bundle in placement_group.bundle_specs:
|
||||||
|
|||||||
@ -16,7 +16,6 @@ try:
|
|||||||
|
|
||||||
def __init__(self, init_cached_hf_modules=False) -> None:
|
def __init__(self, init_cached_hf_modules=False) -> None:
|
||||||
if init_cached_hf_modules:
|
if init_cached_hf_modules:
|
||||||
# pylint: disable=import-outside-toplevel
|
|
||||||
from transformers.dynamic_module_utils import init_hf_modules
|
from transformers.dynamic_module_utils import init_hf_modules
|
||||||
init_hf_modules()
|
init_hf_modules()
|
||||||
self.worker = None
|
self.worker = None
|
||||||
@ -37,7 +36,7 @@ except ImportError as e:
|
|||||||
"`pip install ray pandas pyarrow`.")
|
"`pip install ray pandas pyarrow`.")
|
||||||
ray = None
|
ray = None
|
||||||
TorchDistributedWorker = None
|
TorchDistributedWorker = None
|
||||||
RayWorker = None # pylint: disable=invalid-name
|
RayWorker = None
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from ray.util.placement_group import PlacementGroup
|
from ray.util.placement_group import PlacementGroup
|
||||||
|
|||||||
@ -134,25 +134,21 @@ class LLM:
|
|||||||
if isinstance(prompts, str):
|
if isinstance(prompts, str):
|
||||||
# Convert a single prompt to a list.
|
# Convert a single prompt to a list.
|
||||||
prompts = [prompts]
|
prompts = [prompts]
|
||||||
if prompts is not None and prompt_token_ids is not None:
|
if (prompts is not None and prompt_token_ids is not None
|
||||||
if len(prompts) != len(prompt_token_ids):
|
and len(prompts) != len(prompt_token_ids)):
|
||||||
raise ValueError("The lengths of prompts and prompt_token_ids "
|
raise ValueError("The lengths of prompts and prompt_token_ids "
|
||||||
"must be the same.")
|
"must be the same.")
|
||||||
if sampling_params is None:
|
if sampling_params is None:
|
||||||
# Use default sampling params.
|
# Use default sampling params.
|
||||||
sampling_params = SamplingParams()
|
sampling_params = SamplingParams()
|
||||||
|
|
||||||
# Add requests to the engine.
|
# Add requests to the engine.
|
||||||
if prompts is not None:
|
num_requests = len(prompts) if prompts is not None else len(
|
||||||
num_requests = len(prompts)
|
prompt_token_ids)
|
||||||
else:
|
|
||||||
num_requests = len(prompt_token_ids)
|
|
||||||
for i in range(num_requests):
|
for i in range(num_requests):
|
||||||
prompt = prompts[i] if prompts is not None else None
|
prompt = prompts[i] if prompts is not None else None
|
||||||
if prompt_token_ids is None:
|
token_ids = None if prompt_token_ids is None else prompt_token_ids[
|
||||||
token_ids = None
|
i]
|
||||||
else:
|
|
||||||
token_ids = prompt_token_ids[i]
|
|
||||||
self._add_request(prompt, sampling_params, token_ids)
|
self._add_request(prompt, sampling_params, token_ids)
|
||||||
return self._run_engine(use_tqdm)
|
return self._run_engine(use_tqdm)
|
||||||
|
|
||||||
|
|||||||
@ -55,7 +55,7 @@ def create_error_response(status_code: HTTPStatus,
|
|||||||
|
|
||||||
|
|
||||||
@app.exception_handler(RequestValidationError)
|
@app.exception_handler(RequestValidationError)
|
||||||
async def validation_exception_handler(request, exc): # pylint: disable=unused-argument
|
async def validation_exception_handler(_, exc):
|
||||||
return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))
|
return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))
|
||||||
|
|
||||||
|
|
||||||
@ -124,10 +124,8 @@ async def check_length(
|
|||||||
assert (not (prompt is None and prompt_ids is None)
|
assert (not (prompt is None and prompt_ids is None)
|
||||||
and not (prompt is not None and prompt_ids is not None)
|
and not (prompt is not None and prompt_ids is not None)
|
||||||
), "Either prompt or prompt_ids should be provided."
|
), "Either prompt or prompt_ids should be provided."
|
||||||
if prompt_ids is not None:
|
input_ids = prompt_ids if prompt_ids is not None else tokenizer(
|
||||||
input_ids = prompt_ids
|
prompt).input_ids
|
||||||
else:
|
|
||||||
input_ids = tokenizer(prompt).input_ids
|
|
||||||
token_num = len(input_ids)
|
token_num = len(input_ids)
|
||||||
|
|
||||||
if request.max_tokens is None:
|
if request.max_tokens is None:
|
||||||
|
|||||||
@ -84,15 +84,14 @@ def get_act_fn(
|
|||||||
f"Activation function {act_fn_name!r} is not supported.")
|
f"Activation function {act_fn_name!r} is not supported.")
|
||||||
|
|
||||||
act_fn = _ACTIVATION_REGISTRY[act_fn_name]
|
act_fn = _ACTIVATION_REGISTRY[act_fn_name]
|
||||||
if quant_config is not None:
|
if quant_config is not None and act_fn_name in quant_config.get_scaled_act_names(
|
||||||
if act_fn_name in quant_config.get_scaled_act_names():
|
):
|
||||||
if intermediate_size is None:
|
if intermediate_size is None:
|
||||||
raise ValueError(
|
raise ValueError("intermediate_size must be specified for scaled "
|
||||||
"intermediate_size must be specified for scaled "
|
"activation functions.")
|
||||||
"activation functions.")
|
return ScaledActivation(
|
||||||
return ScaledActivation(
|
act_fn,
|
||||||
act_fn,
|
intermediate_size,
|
||||||
intermediate_size,
|
params_dtype=torch.get_default_dtype(),
|
||||||
params_dtype=torch.get_default_dtype(),
|
)
|
||||||
)
|
|
||||||
return act_fn
|
return act_fn
|
||||||
|
|||||||
@ -18,7 +18,6 @@ _PARTITION_SIZE = 512
|
|||||||
|
|
||||||
|
|
||||||
class PagedAttention(nn.Module):
|
class PagedAttention(nn.Module):
|
||||||
# pylint: disable=line-too-long
|
|
||||||
"""GPT-style multi-head PagedAttention.
|
"""GPT-style multi-head PagedAttention.
|
||||||
|
|
||||||
This class takes query, key, and value tensors as input. The input tensors
|
This class takes query, key, and value tensors as input. The input tensors
|
||||||
|
|||||||
@ -50,7 +50,7 @@ class AWQConfig(QuantizationConfig):
|
|||||||
def get_config_filenames() -> List[str]:
|
def get_config_filenames() -> List[str]:
|
||||||
return [
|
return [
|
||||||
"quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq
|
"quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq
|
||||||
"quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq # pylint: disable=line-too-long
|
"quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
|
||||||
]
|
]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import torch.nn as nn
|
|||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.model_executor.models import * # pylint: disable=wildcard-import
|
from vllm.model_executor.models import *
|
||||||
from vllm.model_executor.weight_utils import (get_quant_config,
|
from vllm.model_executor.weight_utils import (get_quant_config,
|
||||||
initialize_dummy_weights)
|
initialize_dummy_weights)
|
||||||
|
|
||||||
|
|||||||
@ -261,10 +261,7 @@ class AquilaModel(nn.Module):
|
|||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
hidden_states = self.embed_tokens(input_ids)
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
for i in range(len(self.layers)):
|
for i in range(len(self.layers)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
hidden_states = layer(
|
hidden_states = layer(
|
||||||
positions,
|
positions,
|
||||||
|
|||||||
@ -281,10 +281,7 @@ class BaiChuanModel(nn.Module):
|
|||||||
hidden_states = self.embed_tokens(input_ids)
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
residual = None
|
residual = None
|
||||||
for i in range(len(self.layers)):
|
for i in range(len(self.layers)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
hidden_states, residual = layer(
|
hidden_states, residual = layer(
|
||||||
positions,
|
positions,
|
||||||
|
|||||||
@ -256,10 +256,7 @@ class BloomModel(nn.Module):
|
|||||||
hidden_states = self.word_embeddings(input_ids)
|
hidden_states = self.word_embeddings(input_ids)
|
||||||
hidden_states = self.word_embeddings_layernorm(hidden_states)
|
hidden_states = self.word_embeddings_layernorm(hidden_states)
|
||||||
for i in range(len(self.h)):
|
for i in range(len(self.h)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.h[i]
|
layer = self.h[i]
|
||||||
hidden_states = layer(
|
hidden_states = layer(
|
||||||
position_ids,
|
position_ids,
|
||||||
|
|||||||
@ -269,10 +269,7 @@ class GLMTransformer(nn.Module):
|
|||||||
cache_events: Optional[List[torch.cuda.Event]],
|
cache_events: Optional[List[torch.cuda.Event]],
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
for i in range(self.num_layers):
|
for i in range(self.num_layers):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
hidden_states = layer(
|
hidden_states = layer(
|
||||||
hidden_states=hidden_states,
|
hidden_states=hidden_states,
|
||||||
|
|||||||
@ -353,10 +353,7 @@ class FalconModel(nn.Module):
|
|||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
hidden_states = self.word_embeddings(input_ids)
|
hidden_states = self.word_embeddings(input_ids)
|
||||||
for i in range(len(self.h)):
|
for i in range(len(self.h)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.h[i]
|
layer = self.h[i]
|
||||||
hidden_states = layer(
|
hidden_states = layer(
|
||||||
positions,
|
positions,
|
||||||
|
|||||||
@ -206,10 +206,7 @@ class GPT2Model(nn.Module):
|
|||||||
hidden_states = inputs_embeds + position_embeds
|
hidden_states = inputs_embeds + position_embeds
|
||||||
|
|
||||||
for i in range(len(self.h)):
|
for i in range(len(self.h)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.h[i]
|
layer = self.h[i]
|
||||||
hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
|
hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
|
||||||
cache_event)
|
cache_event)
|
||||||
|
|||||||
@ -225,10 +225,7 @@ class GPTBigCodeModel(nn.Module):
|
|||||||
hidden_states = inputs_embeds + position_embeds
|
hidden_states = inputs_embeds + position_embeds
|
||||||
|
|
||||||
for i in range(len(self.h)):
|
for i in range(len(self.h)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.h[i]
|
layer = self.h[i]
|
||||||
hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
|
hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
|
||||||
cache_event)
|
cache_event)
|
||||||
|
|||||||
@ -147,10 +147,7 @@ class GPTJBlock(nn.Module):
|
|||||||
linear_method: Optional[LinearMethodBase] = None,
|
linear_method: Optional[LinearMethodBase] = None,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if config.n_inner is None:
|
inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner
|
||||||
inner_dim = 4 * config.n_embd
|
|
||||||
else:
|
|
||||||
inner_dim = config.n_inner
|
|
||||||
self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
||||||
self.attn = GPTJAttention(config, linear_method)
|
self.attn = GPTJAttention(config, linear_method)
|
||||||
self.mlp = GPTJMLP(inner_dim, config, linear_method)
|
self.mlp = GPTJMLP(inner_dim, config, linear_method)
|
||||||
@ -205,10 +202,7 @@ class GPTJModel(nn.Module):
|
|||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
hidden_states = self.wte(input_ids)
|
hidden_states = self.wte(input_ids)
|
||||||
for i in range(len(self.h)):
|
for i in range(len(self.h)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.h[i]
|
layer = self.h[i]
|
||||||
hidden_states = layer(
|
hidden_states = layer(
|
||||||
position_ids,
|
position_ids,
|
||||||
|
|||||||
@ -216,10 +216,7 @@ class GPTNeoXModel(nn.Module):
|
|||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
hidden_states = self.embed_in(input_ids)
|
hidden_states = self.embed_in(input_ids)
|
||||||
for i in range(len(self.layers)):
|
for i in range(len(self.layers)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
hidden_states = layer(
|
hidden_states = layer(
|
||||||
position_ids,
|
position_ids,
|
||||||
|
|||||||
@ -213,10 +213,7 @@ class InternLMModel(nn.Module):
|
|||||||
hidden_states = self.embed_tokens(input_ids)
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
residual = None
|
residual = None
|
||||||
for i in range(len(self.layers)):
|
for i in range(len(self.layers)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
hidden_states, residual = layer(
|
hidden_states, residual = layer(
|
||||||
positions,
|
positions,
|
||||||
|
|||||||
@ -253,10 +253,7 @@ class LlamaModel(nn.Module):
|
|||||||
hidden_states = self.embed_tokens(input_ids)
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
residual = None
|
residual = None
|
||||||
for i in range(len(self.layers)):
|
for i in range(len(self.layers)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
hidden_states, residual = layer(
|
hidden_states, residual = layer(
|
||||||
positions,
|
positions,
|
||||||
|
|||||||
@ -248,10 +248,7 @@ class MistralModel(nn.Module):
|
|||||||
hidden_states = self.embed_tokens(input_ids)
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
residual = None
|
residual = None
|
||||||
for i in range(len(self.layers)):
|
for i in range(len(self.layers)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
hidden_states, residual = layer(
|
hidden_states, residual = layer(
|
||||||
positions,
|
positions,
|
||||||
|
|||||||
@ -203,10 +203,10 @@ class MPTModel(nn.Module):
|
|||||||
self.norm_f = nn.LayerNorm(config.d_model)
|
self.norm_f = nn.LayerNorm(config.d_model)
|
||||||
if config.no_bias:
|
if config.no_bias:
|
||||||
for module in self.modules():
|
for module in self.modules():
|
||||||
if hasattr(module, "bias"):
|
if hasattr(module, "bias") and isinstance(
|
||||||
if isinstance(module.bias, nn.Parameter):
|
module.bias, nn.Parameter):
|
||||||
# Remove the bias term in Linear and LayerNorm.
|
# Remove the bias term in Linear and LayerNorm.
|
||||||
module.register_parameter("bias", None)
|
module.register_parameter("bias", None)
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
@ -218,10 +218,7 @@ class MPTModel(nn.Module):
|
|||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
hidden_states = self.wte(input_ids)
|
hidden_states = self.wte(input_ids)
|
||||||
for i in range(len(self.blocks)):
|
for i in range(len(self.blocks)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
block = self.blocks[i]
|
block = self.blocks[i]
|
||||||
hidden_states = block(
|
hidden_states = block(
|
||||||
position_ids,
|
position_ids,
|
||||||
|
|||||||
@ -257,10 +257,7 @@ class OPTDecoder(nn.Module):
|
|||||||
hidden_states = inputs_embeds + pos_embeds
|
hidden_states = inputs_embeds + pos_embeds
|
||||||
|
|
||||||
for i in range(len(self.layers)):
|
for i in range(len(self.layers)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
|
hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
|
||||||
cache_event)
|
cache_event)
|
||||||
|
|||||||
@ -258,10 +258,7 @@ class PhiModel(nn.Module):
|
|||||||
) -> SamplerOutput:
|
) -> SamplerOutput:
|
||||||
hidden_states = self.embd(input_ids)
|
hidden_states = self.embd(input_ids)
|
||||||
for i in range(self.config.num_hidden_layers):
|
for i in range(self.config.num_hidden_layers):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.h[i]
|
layer = self.h[i]
|
||||||
hidden_states = layer(
|
hidden_states = layer(
|
||||||
positions,
|
positions,
|
||||||
|
|||||||
@ -213,10 +213,7 @@ class QWenModel(nn.Module):
|
|||||||
hidden_states = self.wte(input_ids)
|
hidden_states = self.wte(input_ids)
|
||||||
residual = None
|
residual = None
|
||||||
for i in range(len(self.h)):
|
for i in range(len(self.h)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.h[i]
|
layer = self.h[i]
|
||||||
hidden_states, residual = layer(
|
hidden_states, residual = layer(
|
||||||
positions,
|
positions,
|
||||||
|
|||||||
@ -249,10 +249,7 @@ class YiModel(nn.Module):
|
|||||||
hidden_states = self.embed_tokens(input_ids)
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
residual = None
|
residual = None
|
||||||
for i in range(len(self.layers)):
|
for i in range(len(self.layers)):
|
||||||
if cache_events is None:
|
cache_event = None if cache_events is None else cache_events[i]
|
||||||
cache_event = None
|
|
||||||
else:
|
|
||||||
cache_event = cache_events[i]
|
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
hidden_states, residual = layer(
|
hidden_states, residual = layer(
|
||||||
positions,
|
positions,
|
||||||
|
|||||||
@ -131,11 +131,9 @@ def prepare_hf_model_weights(
|
|||||||
) -> Tuple[str, List[str], bool]:
|
) -> Tuple[str, List[str], bool]:
|
||||||
# Download model weights from huggingface.
|
# Download model weights from huggingface.
|
||||||
is_local = os.path.isdir(model_name_or_path)
|
is_local = os.path.isdir(model_name_or_path)
|
||||||
if use_safetensors:
|
# Some quantized models use .pt files for storing the weights.
|
||||||
allow_patterns = ["*.safetensors"]
|
allow_patterns = ["*.safetensors"
|
||||||
else:
|
] if use_safetensors else ["*.bin", "*.pt"]
|
||||||
# Some quantized models use .pt files for storing the weights.
|
|
||||||
allow_patterns = ["*.bin", "*.pt"]
|
|
||||||
if not is_local:
|
if not is_local:
|
||||||
# Use file lock to prevent multiple processes from
|
# Use file lock to prevent multiple processes from
|
||||||
# downloading the same model weights at the same time.
|
# downloading the same model weights at the same time.
|
||||||
@ -242,7 +240,7 @@ def hf_model_weights_iterator(
|
|||||||
elif use_safetensors:
|
elif use_safetensors:
|
||||||
for st_file in hf_weights_files:
|
for st_file in hf_weights_files:
|
||||||
with safe_open(st_file, framework="pt") as f:
|
with safe_open(st_file, framework="pt") as f:
|
||||||
for name in f.keys():
|
for name in f:
|
||||||
param = f.get_tensor(name)
|
param = f.get_tensor(name)
|
||||||
yield name, param
|
yield name, param
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -2,7 +2,7 @@ from typing import Optional
|
|||||||
|
|
||||||
from transformers import AutoConfig, PretrainedConfig
|
from transformers import AutoConfig, PretrainedConfig
|
||||||
|
|
||||||
from vllm.transformers_utils.configs import * # pylint: disable=wildcard-import
|
from vllm.transformers_utils.configs import *
|
||||||
|
|
||||||
_CONFIG_REGISTRY = {
|
_CONFIG_REGISTRY = {
|
||||||
"aquila": AquilaConfig,
|
"aquila": AquilaConfig,
|
||||||
|
|||||||
@ -62,7 +62,6 @@ class MPTConfig(PretrainedConfig):
|
|||||||
fc_type: str = 'torch',
|
fc_type: str = 'torch',
|
||||||
verbose: Optional[int] = None,
|
verbose: Optional[int] = None,
|
||||||
**kwargs: Any):
|
**kwargs: Any):
|
||||||
# pylint: disable=line-too-long
|
|
||||||
"""The MPT configuration class.
|
"""The MPT configuration class.
|
||||||
Args:
|
Args:
|
||||||
d_model (int): The size of the embedding dimension of the model.
|
d_model (int): The size of the embedding dimension of the model.
|
||||||
@ -139,10 +138,10 @@ class MPTConfig(PretrainedConfig):
|
|||||||
self.init_config = init_config
|
self.init_config = init_config
|
||||||
self.fc_type = fc_type
|
self.fc_type = fc_type
|
||||||
if verbose is not None:
|
if verbose is not None:
|
||||||
warnings.warn(
|
warnings.warn(DeprecationWarning(
|
||||||
DeprecationWarning(
|
'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'
|
||||||
'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'
|
),
|
||||||
))
|
stacklevel=2)
|
||||||
if 'name' in kwargs:
|
if 'name' in kwargs:
|
||||||
del kwargs['name']
|
del kwargs['name']
|
||||||
if 'loss_fn' in kwargs:
|
if 'loss_fn' in kwargs:
|
||||||
@ -150,8 +149,8 @@ class MPTConfig(PretrainedConfig):
|
|||||||
if self.attn_config.get('alibi', False):
|
if self.attn_config.get('alibi', False):
|
||||||
self.learned_pos_emb = False
|
self.learned_pos_emb = False
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`'
|
f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`',
|
||||||
)
|
stacklevel=2)
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self._validate_config()
|
self._validate_config()
|
||||||
|
|
||||||
@ -211,7 +210,8 @@ class MPTConfig(PretrainedConfig):
|
|||||||
)
|
)
|
||||||
if not self.learned_pos_emb and (not self.attn_config['alibi']):
|
if not self.learned_pos_emb and (not self.attn_config['alibi']):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'Positional information not being provided to the model.')
|
'Positional information not being provided to the model.',
|
||||||
|
stacklevel=2)
|
||||||
if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
||||||
try:
|
try:
|
||||||
# pylint: disable=import-outside-toplevel
|
# pylint: disable=import-outside-toplevel
|
||||||
|
|||||||
@ -30,7 +30,7 @@ class Counter:
|
|||||||
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
|
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
|
||||||
"""Returns the maximum shared memory per thread block in bytes."""
|
"""Returns the maximum shared memory per thread block in bytes."""
|
||||||
# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
|
# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
|
||||||
cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 # pylint: disable=invalid-name
|
cudaDevAttrMaxSharedMemoryPerBlockOptin = 97
|
||||||
max_shared_mem = cuda_utils.get_device_attribute(
|
max_shared_mem = cuda_utils.get_device_attribute(
|
||||||
cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
|
cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
|
||||||
return int(max_shared_mem)
|
return int(max_shared_mem)
|
||||||
|
|||||||
@ -350,10 +350,7 @@ class Worker:
|
|||||||
self.cache_engine.copy(blocks_to_copy)
|
self.cache_engine.copy(blocks_to_copy)
|
||||||
issued_cache_op = True
|
issued_cache_op = True
|
||||||
|
|
||||||
if issued_cache_op:
|
cache_events = self.cache_events if issued_cache_op else None
|
||||||
cache_events = self.cache_events
|
|
||||||
else:
|
|
||||||
cache_events = None
|
|
||||||
|
|
||||||
# If there is no input, we don't need to execute the model.
|
# If there is no input, we don't need to execute the model.
|
||||||
if not seq_group_metadata_list:
|
if not seq_group_metadata_list:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user