Merge 8b5f93df89eb527945375debd351c5da435613e7 into 9b4e9788e4a3a731f7567338ed15d3ec549ce03b
Commit 03962231ec
.github/workflows/python-app.yml (vendored, new file, 39 lines)
@@ -0,0 +1,39 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python application

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python 3.10
      uses: actions/setup-python@v3
      with:
        python-version: "3.10"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest
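The workflow's last step runs bare pytest from the repository root, so any test_*.py file it finds is collected. A minimal smoke test that step could pick up (a sketch only; the file name, location, and import path are assumptions, not part of this commit):

# tests/test_server_smoke.py: hypothetical smoke test for the CI's pytest step.
# It checks that the server module imports and registers its two routes;
# no model weights are needed because the backend is loaded lazily.

def test_chat_routes_registered():
    # Assumes inference/ is importable (e.g. on PYTHONPATH), matching the
    # module's own flat imports ("from model import ...").
    from text_chat_server import app

    rules = {rule.rule for rule in app.url_map.iter_rules()}
    assert "/text-chat" in rules
    assert "/api/text-chat" in rules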
inference/text_chat_server.py (new file, 135 lines)
@@ -0,0 +1,135 @@
"""Minimal text-only chat server for DeepSeek-V3: a no-JS HTML page plus a plain-text HTTP API."""
import os
import json
import glob
import threading

from flask import Flask, request, Response, render_template_string
import torch

from transformers import AutoTokenizer

from model import Transformer, ModelArgs
from generate import generate

app = Flask(__name__)

# Lazily-initialized model backend, shared across requests and guarded by _lock.
_lock = threading.Lock()
_backend = {"loaded": False, "model": None, "tokenizer": None, "args": None, "device": "cpu"}

TEXT_CHAT_HTML = """<!doctype html>
<html>
<head><meta charset="utf-8"><title>DeepSeek Text Chat (no-JS)</title></head>
<body>
<h2>DeepSeek — text-only chat</h2>
<form method="post" action="/text-chat">
<label for="message">Message:</label><br>
<textarea id="message" name="message" rows="4" cols="60">{{message}}</textarea><br>
<input type="submit" value="Send">
</form>
{% if reply %}
<h3>Reply</h3>
<pre>{{reply}}</pre>
{% endif %}
<p>If this server is not configured with local model weights, use the /api/text-chat endpoint against your deployment.</p>
</body>
</html>
"""

def _find_checkpoint_file(ckpt_dir):
    # Return the first safetensors shard found under ckpt_dir, or None.
    patterns = ["model*.safetensors", "model*-mp*.safetensors", "*.safetensors"]
    for p in patterns:
        matches = glob.glob(os.path.join(ckpt_dir, p))
        if matches:
            return matches[0]
    return None


def _load_backend():
    # Build the model and tokenizer once, on first use; requests read the
    # shared _backend dict rather than holding their own references.
    with _lock:
        if _backend["loaded"]:
            return
        ckpt_path = os.getenv("DEEPSEEK_CKPT_PATH")
        config_path = os.getenv("DEEPSEEK_CONFIG_PATH")
        if not ckpt_path or not config_path:
            _backend["loaded"] = False
            return
        device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            with open(config_path, "r") as f:
                args = ModelArgs(**json.load(f))
            model = Transformer(args)
            tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
            ckpt_file = _find_checkpoint_file(ckpt_path)
            if ckpt_file:
                try:
                    from safetensors.torch import load_model as _load_model
                    _load_model(model, ckpt_file)
                except Exception:
                    # best-effort load; if it fails we keep backend unloaded
                    _backend["loaded"] = False
                    return
            else:
                _backend["loaded"] = False
                return
            _backend.update({"loaded": True, "model": model, "tokenizer": tokenizer, "args": args, "device": device})
        except Exception:
            _backend["loaded"] = False

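# Configuration contract, as read above: DEEPSEEK_CKPT_PATH points at a local
# checkpoint directory (tokenizer files plus a *.safetensors shard) and
# DEEPSEEK_CONFIG_PATH at a JSON file whose keys match ModelArgs. If either is
# missing or loading fails, the backend stays unloaded and the API below
# answers with HTTP 503 instead of generating.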
def _ensure_backend():
    if not _backend["loaded"]:
        _load_backend()


def _generate_reply(message, max_new_tokens=200, temperature=1.0):
    _ensure_backend()
    if not _backend["loaded"] or _backend["model"] is None or _backend["tokenizer"] is None:
        raise RuntimeError("Local model backend not configured or failed to load. Set the DEEPSEEK_CKPT_PATH and DEEPSEEK_CONFIG_PATH environment variables to a valid local checkpoint and config to enable local generation.")
    tokenizer = _backend["tokenizer"]
    model = _backend["model"]
    try:
        # Prefer the tokenizer's chat template; fall back to raw encoding.
        prompt_tokens = tokenizer.apply_chat_template([{"role": "user", "content": message}], add_generation_prompt=True)
    except Exception:
        prompt_tokens = tokenizer.encode(message)
    completion_tokens = generate(model, [prompt_tokens], max_new_tokens, getattr(tokenizer, "eos_token_id", -1), temperature)
    try:
        completion = tokenizer.decode(completion_tokens[0], skip_special_tokens=True)
    except Exception:
        completion = " ".join(map(str, completion_tokens[0]))
    return completion

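# Note on the generate() call above: the positional arguments assume the helper
# in inference/generate.py has the signature generate(model, prompt_tokens,
# max_new_tokens, eos_id, temperature), taking a batch of prompts (hence the
# single-element [prompt_tokens] list) and returning one token-id list per
# prompt. That is an assumption about the sibling module, not verified here.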
@app.route("/api/text-chat", methods=["POST"])
def api_text_chat():
    # Accept either a JSON body {"message": ...} or form/query parameters.
    if request.is_json:
        payload = request.get_json()
        message = payload.get("message") if isinstance(payload, dict) else None
    else:
        message = request.form.get("message") or request.values.get("message")
    if not message:
        return Response("Missing 'message' parameter", status=400, mimetype="text/plain; charset=utf-8")
    try:
        reply = _generate_reply(message)
    except RuntimeError as e:
        return Response(str(e), status=503, mimetype="text/plain; charset=utf-8")
    return Response(reply, status=200, mimetype="text/plain; charset=utf-8")


@app.route("/text-chat", methods=["GET", "POST"])
def text_chat_page():
    if request.method == "GET":
        return render_template_string(TEXT_CHAT_HTML, message="", reply=None)
    message = request.form.get("message", "")
    if not message:
        return render_template_string(TEXT_CHAT_HTML, message=message, reply="No message provided")
    try:
        reply = _generate_reply(message)
    except RuntimeError as e:
        return render_template_string(TEXT_CHAT_HTML, message=message, reply=str(e))
    return render_template_string(TEXT_CHAT_HTML, message=message, reply=reply)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=int(os.getenv("PORT", "5000")))
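Once the server is running (python inference/text_chat_server.py, with the two environment variables exported for local generation), the API endpoint can be exercised with a short client. A minimal sketch, assuming the third-party requests package and the default port 5000, neither of which is part of this commit:

# client_example.py: hypothetical client for the /api/text-chat endpoint.
import requests

resp = requests.post(
    "http://localhost:5000/api/text-chat",
    json={"message": "Hello, DeepSeek!"},
    timeout=300,  # generation can be slow, especially on CPU
)
if resp.status_code == 503:
    # Backend not configured: env vars unset or the checkpoint failed to load.
    print("Server has no local model:", resp.text)
else:
    print(resp.text)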