diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index cc9a839b87ef5..2a41d43ab9660 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -88,7 +88,6 @@ Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
 # Image.MAX_IMAGE_PIXELS = 300000000  # ~300M pixels
 
 
-# TODO(nhaber): get 2048 from config
 # TODO(nhaber): does use_thumbnail=True work?
 # TODO(nhaber): mixing images and videos will mess up the "text_prompt_length" calculation.
 
@@ -102,28 +101,20 @@ IMG_CONTEXT = "<image>"
 DEFAULT_NUM_TILES = 12
 
 
-@dataclass(kw_only=True, frozen=True)
-class Dims:
-    height: int
-    width: int
-  
-
-CONV_MERGING = False # This is assumed to be False for now
-PIXEL_SHUFFLE = True # This is assumed to be True for now
-REDUCTION_FACTOR = 2 ** (PIXEL_SHUFFLE + CONV_MERGING)
-
-def num_image_token_per_tile(*, tile_dims: Dims, patch_size: int, downsample_ratio: int) -> int:
-    tile_size = math.sqrt(tile_dims.width * tile_dims.height)
-    num_tokens = int(
-        (tile_size // patch_size) ** 2 * (downsample_ratio**2)
-    )
+def num_image_token_per_tile(
+    *, width: int, height: int, patch_size: int, downsample_ratio: int
+) -> int:
+    """Tokens per tile after downsampling; integer math avoids sqrt()**2 float error."""
+    num_tokens = int((width // patch_size) * (height // patch_size) // downsample_ratio**2)
     return num_tokens
 
+
 def width_and_height_for_max_num_tokens_available(
     *,
     target_num_tokens_post_shuffle: int,
     patch_size: int,
-) -> Dims:
+    downsample_ratio: int,
+) -> tuple[int, int]:
     """
     TODO(nhaber): optimize this so it squeezes closer to target number of tokens.
     Calculate image dimensions that produce approximately `target` tokens after
@@ -133,14 +124,26 @@ def width_and_height_for_max_num_tokens_available(
     need 4*B patches to get B tokens.
 
     Examples:
-        >>> dims = width_and_height_for_max_num_tokens_available(B=8192, patch_size=16)
-        >>> assert dims.width, dims.height == (2880, 2880)
-        >>> assert ((dims.width // 16) * (dims.height // 16) // 4) == 8100 # tokens after shuffle
-        >>> assert num_image_token_per_tile(tile_dims=dims, patch_size=16, downsample_ratio=2) == 8100
+    >>> width, height = width_and_height_for_max_num_tokens_available(
+    ...     target_num_tokens_post_shuffle=8192,
+    ...     patch_size=16,
+    ...     downsample_ratio=2,
+    ... )
+    >>> assert (width, height) == (2880, 2880)
+    >>> assert (width // 16) * (height // 16) // 2**2 == 8100  # tokens post-shuffle
+    >>> assert (
+    ...     num_image_token_per_tile(
+    ...         width=width, height=height, patch_size=16, downsample_ratio=2
+    ...     )
+    ...     == 8100
+    ... )
     """
-    side_pixels = math.isqrt(target_num_tokens_post_shuffle) * REDUCTION_FACTOR * patch_size
+    side_pixels = (
+        math.isqrt(target_num_tokens_post_shuffle) * downsample_ratio * patch_size
+    )
     assert isinstance(side_pixels, int) and side_pixels % patch_size == 0
-    return Dims(width=side_pixels, height=side_pixels)
+    return side_pixels, side_pixels
+
 
 @dataclass
 class DynamicResolutionParams:
@@ -354,7 +357,7 @@ class BaseNanoNemotronVLProcessor(ABC):
         self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
         image_size: int = config.force_image_size
         self.patch_size: int = getattr(config, "patch_size", 16)
-        self.downsample_ratio: float = self.config.downsample_ratio
+        # self.downsample_ratio: float = self.config.downsample_ratio
 
         self.image_size = image_size
         self.use_thumbnail: bool = config.use_thumbnail
@@ -392,9 +395,10 @@ class BaseNanoNemotronVLProcessor(ABC):
         )
 
         return num_tiles * num_image_token_per_tile(
-            tile_dims=Dims(width=image_width, height=image_height),
+            width=image_width,
+            height=image_height,
             patch_size=self.patch_size,
-            downsample_ratio=self.downsample_ratio
+            downsample_ratio=self.downsample_ratio,
         )
 
     def _images_to_pixel_values_lst(
@@ -508,8 +512,9 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
         super().__init__(
             config=config, tokenizer=tokenizer, max_num_tiles=max_num_tiles, **kwargs
         )
-        self.max_model_len = max_model_len
 
+        self._patch_size: int = getattr(config, "patch_size", 16)
+        self.max_model_len = max_model_len
         self._min_num_patches = min_num_patches
         self._factor_max = factor_max
         self._pixel_shuffle = pixel_shuffle
@@ -518,47 +523,90 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
         self._use_thumbnail = use_thumbnail
         self._thumbnail_size = thumbnail_size
         self._thumbnail_area_threshold = thumbnail_area_threshold
+        self.norm_mean = torch.tensor(self.CLIP_PIXEL_MEAN).reshape(1, 3, 1, 1)
+        self.norm_std = torch.tensor(self.CLIP_PIXEL_STD).reshape(1, 3, 1, 1)
         self._transform = T.Compose(
             [
                 T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
                 T.ToTensor(),  # T.Lambda(lambda img: _fast_to_tensor(img)),
+                # T.Normalize(mean=pixel_mean, std=pixel_std), - This is done down below with input_conditioner
             ]
         )
         self._apply_data_augment = apply_data_augment
+        reduction_factor = 1 / self.config.downsample_ratio
+        assert reduction_factor == 2.0, (
+            "I don't understand what's going on if this isn't 4"
+        )
+        self.downsample_ratio = int(reduction_factor) ** (pixel_shuffle + conv_merging)
+        assert self.downsample_ratio == 2, (
+            f"I don't understand what's going on if {self.downsample_ratio=} isn't 2"
+        )
 
-        self.norm_mean = torch.tensor(self.CLIP_PIXEL_MEAN).reshape(1, 3, 1, 1)
-        self.norm_std = torch.tensor(self.CLIP_PIXEL_STD).reshape(1, 3, 1, 1)
-        self.downsample_ratio = 2 if pixel_shuffle else 1
+    def _get_num_embeddings(self, width: int, height: int) -> int:  # width/height in pixels
+        return num_image_token_per_tile(
+            width=width,
+            height=height,
+            patch_size=self._patch_size,
+            downsample_ratio=self.downsample_ratio,
+        )
+
+    def max_num_tokens_available(self, text_prompt_length: int) -> int:
+        return self.max_model_len - text_prompt_length - 4  # NOTE(review): the 4-token reserve is unexplained here - confirm
+
+    def _images_to_pixel_values_lst(
+        self,
+        text_prompt_length: int,
+        images: list[Image.Image],
+        max_num_tiles: int,
+    ) -> tuple[list[torch.Tensor], list[int]]:
+        """Resize images to fit the token budget left after the text prompt."""
+        num_tokens_available = self.max_num_tokens_available(text_prompt_length)
+        params_per_image = self.compute_params(images, num_tokens_available)
+
+        # One entry per tensor from apply_params; 3-D tensors gain a leading batch dim.
+        pixel_values: list[torch.Tensor] = []
+        feature_sizes: list[int] = []
+        for param in params_per_image:
+            for tile in self.apply_params(param):
+                if tile.ndim == 3:
+                    tile = tile.unsqueeze(0)
+                pixel_values.append(tile)
+                feature_sizes.append(param.num_embeddings)
+        return pixel_values, feature_sizes
 
     feature_size_cache: dict[
         Image.Image, int
-    ] = {}  # TODO(nhaber): Find a less silly way of doing this... Why can't this be a class variable?
+    ] = {}  # TODO(nhaber): Find a less silly way of doing this... Why can't this be an instance variable?
 
-    def apply_params(self, params: DynamicResolutionParams) -> torch.Tensor:
+    def get_cached_feature_size(self, image: Image.Image) -> int:
+        """Return the feature size cached for ``image`` and drop it from the cache."""
+        # Entries are keyed by id(image); a missing entry raises KeyError.
+        return self.feature_size_cache.pop(id(image))
+
+    def apply_params(self, params: DynamicResolutionParams) -> list[torch.Tensor]:
         resized_img = params.media.resize(
             (
-                params.patch_size[0] * self.patch_size,
-                params.patch_size[1] * self.patch_size,
+                params.patch_size[0] * self._patch_size,
+                params.patch_size[1] * self._patch_size,
             )
         )
-        # processed_images = [resized_img]
+        processed_images = [resized_img]
 
-        # # Add thumbnail if enabled and image area is below threshold
-        # if self._use_thumbnail:
-        #     # Calculate areas
-        #     resized_area = resized_img.size[0] * resized_img.size[1]
-        #     thumbnail_area = self._thumbnail_size * self._thumbnail_size
-        #     area_ratio = resized_area / thumbnail_area
+        # Add thumbnail if enabled and image area is below threshold
+        if self._use_thumbnail:
+            # Calculate areas
+            resized_area = resized_img.size[0] * resized_img.size[1]
+            thumbnail_area = self._thumbnail_size * self._thumbnail_size
+            area_ratio = resized_area / thumbnail_area
 
-        #     # Only add thumbnail if resized image area is less than threshold % of
-        #     # thumbnail area
-        #     if area_ratio < self._thumbnail_area_threshold:
-        #         thumbnail_img = params.media.resize(
-        #             (self._thumbnail_size, self._thumbnail_size)
-        #         )
-        #         processed_images.append(thumbnail_img)
+            # Only add thumbnail if resized image area is less than threshold % of thumbnail area
+            if area_ratio < self._thumbnail_area_threshold:
+                thumbnail_img = params.media.resize(
+                    (self._thumbnail_size, self._thumbnail_size)
+                )
+                processed_images.append(thumbnail_img)
 
-        return self._transform(resized_img)
+        return [self._transform(img) for img in processed_images]
 
     def process_media(
         self,
@@ -568,11 +616,11 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
         tiling_augment_prob: float = 0.4,
     ) -> tuple[DynamicResolutionParams, int]:
         """Process a single media item and return its parameters.
+
         Args:
             media: The media item to process
             num_tokens_available: Number of tokens available for this media
-            data_augment: Whether to apply data augmentation to the image. Defaults to
-            False.
+            data_augment: Whether to apply data augmentation to the image. Defaults to False.
         Returns:
             DynamicResolutionParams for the media
         """
@@ -581,11 +629,9 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
             "Dynamic resolution is only supported for image media"
         )
         orig_width, orig_height = media.width, media.height
-
-        closest_patch_height = math.ceil(
-            orig_height / self.patch_size
-        )  # TODO(nhaber): Ask Tyler - the previous round + 0.5 code is dangerous [banker's rounding], no? If we flip this back to the round, the max_wh_fill_budget needs to do -1 for each of w;h to be safe
-        closest_patch_width = math.ceil(orig_width / self.patch_size)
+        # TODO(nhaber): Ask Tyler - round(x + 0.5) hits banker's rounding on exact multiples (2.5 -> 2, 3.5 -> 4); is that OK?
+        closest_patch_height = round(orig_height / self._patch_size + 0.5)
+        closest_patch_width = round(orig_width / self._patch_size + 0.5)
         patches = closest_patch_height * closest_patch_width
 
         factor = min(
@@ -594,8 +640,7 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
         target_patch_height = math.floor(factor * closest_patch_height)
         target_patch_width = math.floor(factor * closest_patch_width)
 
-        # We only consider self._min_num_patches if it is greater than
-        # current_num_tokens_available.
+        # We only enforce self._min_num_patches when current_num_tokens_available exceeds it.
         if (
             current_num_tokens_available > self._min_num_patches
             and target_patch_height * target_patch_width < self._min_num_patches
@@ -608,20 +653,19 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
 
         if (
             self._min_side is not None
-            and min(target_patch_width, target_patch_height) * self.patch_size
+            and min(target_patch_width, target_patch_height) * self._patch_size
             < self._min_side
         ):
             if target_patch_width <= target_patch_height:
-                up_factor = self._min_side / (target_patch_width * self.patch_size)
+                up_factor = self._min_side / (target_patch_width * self._patch_size)
                 new_patch_height = math.ceil(up_factor * target_patch_height)
                 new_patch_width = math.ceil(up_factor * target_patch_width)
 
                 if new_patch_height * new_patch_width > current_num_tokens_available:
-                    # If only one side can be min_side, make as big as possible at
-                    # native aspect ratio while staying below max_patches
+                    # If only one side can be min_side, make as big as possible at native aspect ratio while staying below max_patches
                     if (
                         max(current_num_tokens_available // new_patch_width, 1)
-                        * self.patch_size
+                        * self._patch_size
                         < self._min_side
                     ):
                         up_factor = math.sqrt(
@@ -640,16 +684,15 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
                     target_patch_height = new_patch_height
                     target_patch_width = new_patch_width
             else:
-                up_factor = self._min_side / (target_patch_height * self.patch_size)
+                up_factor = self._min_side / (target_patch_height * self._patch_size)
                 new_patch_height = math.ceil(up_factor * target_patch_height)
                 new_patch_width = math.ceil(up_factor * target_patch_width)
 
                 if new_patch_height * new_patch_width > current_num_tokens_available:
-                    # If only one side can be min_side, make as big as possible at
-                    # native aspect ratio while staying below max_patches
+                    # If only one side can be min_side, make as big as possible at native aspect ratio while staying below max_patches
                     if (
                         max(current_num_tokens_available // new_patch_height, 1)
-                        * self.patch_size
+                        * self._patch_size
                         < self._min_side
                     ):
                         up_factor = math.sqrt(
@@ -708,15 +751,10 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
                 target_patch_width, target_patch_height, current_num_tokens_available
             )
 
-        assert isinstance(media, Image.Image), (
-            "Dynamic resolution is only supported for image media"
-        )
-
         # Calculate embeddings for the main dynamic resolution image
-        num_embeddings_per_tile = num_image_token_per_tile(
-            tile_dims=Dims(width=target_patch_width, height=target_patch_height),
-            patch_size=self.patch_size,
-            downsample_ratio=self.downsample_ratio
+        num_embeddings = self._get_num_embeddings(
+            target_patch_width * self._patch_size,
+            target_patch_height * self._patch_size,
         )
 
         token_count = target_patch_width * target_patch_height
@@ -725,33 +763,30 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
         num_tiles = 1  # Base dynamic resolution image
         if self._use_thumbnail:
             # Calculate areas
-            resized_area = (target_patch_width * self.patch_size) * (
-                target_patch_height * self.patch_size
+            resized_area = (target_patch_width * self._patch_size) * (
+                target_patch_height * self._patch_size
             )
             thumbnail_area = self._thumbnail_size * self._thumbnail_size
             area_ratio = resized_area / thumbnail_area
 
-            # Only add thumbnail if resized image area is less than threshold % of
-            # thumbnail area
+            # Only add thumbnail if resized image area is less than threshold % of thumbnail area
             if area_ratio < self._thumbnail_area_threshold:
                 num_tiles += 1  # Add 1 for thumbnail
                 # Add embeddings for thumbnail (thumbnail_size x thumbnail_size)
-                num_embeddings_per_tile += num_image_token_per_tile(
-                    tile_dims=Dims(width=self._thumbnail_size, height=self._thumbnail_size),
-                    patch_size=self.patch_size,
-                    downsample_ratio=self.downsample_ratio
+                num_embeddings += self._get_num_embeddings(
+                    self._thumbnail_size, self._thumbnail_size
                 )
                 token_count += (
                     self._thumbnail_size
-                    // self.patch_size
+                    // self._patch_size
                     * self._thumbnail_size
-                    // self.patch_size
+                    // self._patch_size
                 )
 
         return DynamicResolutionParams(
             media=media,
             num_tiles=num_tiles,
-            num_embeddings=num_embeddings_per_tile,
+            num_embeddings=num_embeddings,
             patch_size=(target_patch_width, target_patch_height),
         ), token_count
 
@@ -805,7 +840,7 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
         media_list: list[Image.Image],
         num_tokens_available: int | None = None,
         data_augment: bool = False,
-    ) -> tuple[list[DynamicResolutionParams], list[int]]:
+    ) -> list[DynamicResolutionParams]:
         """Compute parameters for all media with iterative token budgeting.
 
         Args:
@@ -821,26 +856,24 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
             * (4 if self._pixel_shuffle else 1)
             * (4 if self._conv_merging else 1)
         )
-        # When the number of available token is too small, allow self._min_num_patches
-        # per media and let the sample be truncated.
+        # When the number of available tokens is too small, allow self._min_num_patches per media and
+        # let the sample be truncated.
         num_tokens_available = max(
             num_tokens_available, self._min_num_patches * len(media_list)
         )
 
-        # Clip the number of tokens available per media to be between min and max
-        # patches.
+        # Clip the number of tokens available per media to be between min and max patches.
         num_tokens_available_per_media = [
             max(num_tokens_available, self._min_num_patches)
             for _ in range(len(media_list))
         ]
 
-        # In theory this could be a while True loop, but in case the process_media
-        # method slightly
+        # In theory this could be a while True loop, but in case the process_media method slightly
         # changes, I want to make sure we don't get stuck in an infinite loop.
         for _ in range(10):
             # Step 1: Process each media with current token budget
-            params: list[DynamicResolutionParams] = []
-            token_counts: list[int] = []
+            params = []
+            token_counts = []
 
             for media, tokens_for_media in zip(
                 media_list, num_tokens_available_per_media
@@ -850,18 +883,14 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
                 )
                 params.append(param)
                 token_counts.append(token_count)
+                self.feature_size_cache[id(param.media)] = param.num_embeddings
 
             # Step 2: Check if total tokens is within budget
             total_tokens = sum(token_counts)
 
             if total_tokens <= num_tokens_available:
                 # We're within budget, return the params
-                # Convert from patch count to actual token count after downsampling
-                divisor = (4 if self._pixel_shuffle else 1) * (4 if self._conv_merging else 1)
-                adjusted_token_counts = [tc // divisor for tc in token_counts]
-                for param, feature_size in zip(params, adjusted_token_counts, strict=True):
-                    self.feature_size_cache[id(param.media)] = feature_size
-                return params, adjusted_token_counts
+                return params
 
             # Step 3: We're over budget, need to scale down
             # Calculate scaling factor to get under budget
@@ -880,8 +909,8 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
                     for i in range(len(num_tokens_available_per_media))
                 ]
             )
-            # If there was not scaling down, we're stuck just use min_num_patches per
-            # media, else try with the scaled down num_tokens_available_per_media.
+            # If there was no scaling down, we're stuck: just use min_num_patches per media; else
+            # try with the scaled down num_tokens_available_per_media.
             if not scaled_down:
                 num_tokens_available_per_media = [self._min_num_patches] * len(
                     media_list
@@ -900,15 +929,15 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
         )
 
         def rearrange_img(x):
-            py = x.shape[-2] // self.patch_size
-            px = x.shape[-1] // self.patch_size
+            py = x.shape[-2] // self._patch_size
+            px = x.shape[-1] // self._patch_size
             x = einops.rearrange(
                 x,
                 "c (py yy) (px xx) -> (py px) (c yy xx)",
                 py=py,
-                yy=self.patch_size,
+                yy=self._patch_size,
                 px=px,
-                xx=self.patch_size,
+                xx=self._patch_size,
             )
             return x
 
@@ -941,34 +970,6 @@ class DynamicResolutionImageTiler(BaseNanoNemotronVLProcessor):
                 None,
             )
 
-    def max_num_tokens_available(self, text_prompt_length: int) -> int:
-        return self.max_model_len - text_prompt_length - 4
-
-    def _images_to_pixel_values_lst(
-        self,
-        text_prompt_length: int,
-        images: list[Image.Image],
-        max_num_tiles: int,
-    ) -> tuple[list[torch.Tensor], list[int]]:
-        num_tokens_available = self.max_num_tokens_available(text_prompt_length)
-        params_per_image, feature_sizes = self.compute_params(
-            images, num_tokens_available
-        )
-        print(f"{feature_sizes=}")
-        print(f"{params_per_image=}")
-        images = []
-        for param in params_per_image:
-            t = self.apply_params(param)
-            if t.ndim == 3:
-                t = t.unsqueeze(0)
-            images.append(t)
-        return images, feature_sizes
-
-    def get_cached_feature_size(self, image: Image.Image) -> int:
-        feature_size = self.feature_size_cache[id(image)]
-        del self.feature_size_cache[id(image)]
-        return feature_size
-
 
 class NanoNemotronVLProcessor(DynamicResolutionImageTiler):
     """
@@ -1339,12 +1340,11 @@ class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo):
         processor = self.get_hf_processor()  # we get the CustomProcessor here
 
         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = (
-            seq_len - max_image_tokens
-        ) // num_image_token_per_tile(
-            tile_dims=Dims(width=256, height=256),
-            patch_size=processor.patch_size,
-            downsample_ratio=processor.downsample_ratio
+        max_total_frames = (seq_len - max_image_tokens) // num_image_token_per_tile(
+            width=256,
+            height=256,
+            patch_size=processor._patch_size,
+            downsample_ratio=processor.downsample_ratio,
         )  # TODO(nhaber): get 256 dynamically
         max_frames_per_video = max_total_frames // max(max_videos, 1)
         return max(max_frames_per_video, 1)
@@ -1483,9 +1483,10 @@ class NanoNemotronVLMultiModalProcessor(
 
         def get_video_replacement_internvl(item_idx: int):
             feature_size = num_image_token_per_tile(
-                tile_dims=Dims(width=256, height=256),
-                patch_size=hf_processor.patch_size,
-                downsample_ratio=hf_processor.downsample_ratio
+                width=256,
+                height=256,
+                patch_size=hf_processor._patch_size,
+                downsample_ratio=hf_processor.downsample_ratio,
             )  # TODO(nhaber): get 256 dynamically
             video, metadata = mm_items["video"][item_idx]
             num_patches = video_num_patches[item_idx]
@@ -1550,17 +1551,18 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         num_images = mm_counts.get("image", 0)
         processor = self.info.get_hf_processor()
         B = processor.max_num_tokens_available(text_prompt_length=num_images)
-        target_dims = width_and_height_for_max_num_tokens_available(
+        target_width, target_height = width_and_height_for_max_num_tokens_available(
             target_num_tokens_post_shuffle=B,
-            patch_size=processor.patch_size,
+            patch_size=processor._patch_size,
+            downsample_ratio=processor.downsample_ratio,
         )
 
         image_overrides = mm_options.get("image") if mm_options else None
 
         return {
             "image": self._get_dummy_images(
-                width=target_dims.width,
-                height=target_dims.height,
+                width=target_width,
+                height=target_height,
                 num_images=num_images,
                 overrides=image_overrides,
             )