Commit 4684c2e
rename folder and cleanup namespace
Signed-off-by: Qubitium <[email protected]>
Qubitium committed Mar 2, 2025
1 parent 25f1607 commit 4684c2e
Showing 22 changed files with 6 additions and 22 deletions.
2 changes: 1 addition & 1 deletion gptqmodel/nn_modules/qlinear/exllama_eora.py
@@ -54,7 +54,7 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
 
 
 class ExllamaEoraQuantLinear(BaseQuantLinear):
-    SUPPORTS_BITS = [4, 8]
+    SUPPORTS_BITS = [4]  # fused eora only validated for 4 bits
     SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128]
     SUPPORTS_DESC_ACT = [True, False]
     SUPPORTS_SYM = [True]  # TODO: validate False
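The SUPPORTS_* class attributes advertise which quantization configurations this kernel accepts, so narrowing SUPPORTS_BITS to [4] routes 8-bit models to other kernels instead of the fused EoRA path. A minimal sketch of how such flags can gate kernel selection; the helper supports_config below is hypothetical, not GPTQModel's actual API:

    # Hypothetical helper (not part of GPTQModel): check a requested config
    # against a kernel class's SUPPORTS_* capability flags.
    def supports_config(qlinear_cls, bits, group_size, desc_act, sym):
        return (
            bits in qlinear_cls.SUPPORTS_BITS
            and group_size in qlinear_cls.SUPPORTS_GROUP_SIZE
            and desc_act in qlinear_cls.SUPPORTS_DESC_ACT
            and sym in qlinear_cls.SUPPORTS_SYM
        )

    # After this commit, 8-bit configs no longer match the EoRA kernel:
    supports_config(ExllamaEoraQuantLinear, 8, 128, False, True)  # -> False
    supports_config(ExllamaEoraQuantLinear, 4, 128, False, True)  # -> True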
4 files renamed without changes.
@@ -5,7 +5,6 @@ Copied from https://github.com/turboderp/exllamav2
 #ifndef _compat_cuh
 #define _compat_cuh
 
-namespace vllm {
 namespace gptq {
 // atomicAdd for half types, to support CC < 7.x
 
@@ -60,5 +59,4 @@ __device__ __forceinline__ void atomicAdd(half2* address, half2 val) {
 #endif
 
 } // namespace gptq
-} // namespace vllm
 #endif
@@ -11,7 +11,6 @@ https://github.com/turboderp/exllama
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 
 class MatrixView_half {
@@ -291,5 +290,4 @@ class MatrixView_q8_row {
 };
 
 } // namespace gptq
-} // namespace vllm
 #endif
2 files renamed without changes.
@@ -19,7 +19,6 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa
 #include "qdq_4.cuh"
 #include "qdq_8.cuh"
 
-namespace vllm {
 namespace gptq {
 
 #define BLOCK_KN_SIZE 128
@@ -2074,7 +2073,6 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
 }
 
 } // namespace gptq
-} // namespace vllm
 
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
@@ -2086,7 +2084,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
   at::Tensor temp_dq = torch::empty(
       {b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options);
 
-  vllm::gptq::gemm_half_q_half_cuda(
+  gptq::gemm_half_q_half_cuda(
       at::cuda::getCurrentCUDABlasHandle(), (const half*)a.data_ptr(),
       (const uint32_t*)b_q_weight.data_ptr(),
       (const uint32_t*)b_gptq_qzeros.data_ptr(),
@@ -2112,7 +2110,7 @@ torch::Tensor gptq_gemm_lora(torch::Tensor a, torch::Tensor b_q_weight,
   at::Tensor temp_dq = torch::empty(
       {b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options);
 
-  vllm::gptq::gemm_half_q_half_cuda_eora(
+  gptq::gemm_half_q_half_cuda_eora(
       at::cuda::getCurrentCUDABlasHandle(), (const half*)a.data_ptr(),
       (const uint32_t*)b_q_weight.data_ptr(),
       (const uint32_t*)b_gptq_qzeros.data_ptr(),
@@ -2133,7 +2131,7 @@
 
 void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight));
-  vllm::gptq::shuffle_exllama_weight(
+  gptq::shuffle_exllama_weight(
      (uint32_t*)q_weight.data_ptr(),
      q_perm.device().is_meta() || q_perm.numel() == 0
          ? NULL
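On the host side only the call sites change: temp_dq still holds the fully dequantized weight (packed rows times 32/bit), and the wrappers now resolve into the bare gptq namespace. A minimal Python sketch of driving the compiled extension, assuming the bindings in pybind.cu expose these wrappers under the module name from setup.py; shapes, dtypes, and the exact Python signatures are illustrative assumptions:

    # Illustrative sketch: exercising the compiled extension. The module name
    # 'gptqmodel_exllama_eora' comes from setup.py; the binding names mirror
    # the C++ wrappers above, but exact Python signatures are assumptions.
    import torch
    import gptqmodel_exllama_eora as eora_ext

    bits, k, n = 4, 4096, 4096
    a = torch.randn(8, k, dtype=torch.float16, device="cuda")
    # GPTQ packs `bits`-bit weights into int32 words: k*bits/32 rows of n cols.
    b_q_weight = torch.randint(0, 2**31 - 1, (k * bits // 32, n),
                               dtype=torch.int32, device="cuda")
    q_perm = torch.empty(0, dtype=torch.int32, device="cuda")  # empty => no act-order perm

    eora_ext.gptq_shuffle(b_q_weight, q_perm, bits)  # one-time weight re-layout
    # Per forward pass (qzeros/scales arguments omitted here for brevity):
    # out = eora_ext.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, ...)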
@@ -7,7 +7,6 @@ Copied from https://github.com/turboderp/exllamav2
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 
 // Permutation:
@@ -71,6 +70,5 @@ __forceinline__ __device__ void dequant_2bit_16(const uint32_t q_0,
 }
 
 } // namespace gptq
-} // namespace vllm
 
 #endif
@@ -3,7 +3,6 @@
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 // Permutation:
 //
@@ -144,6 +143,5 @@ __forceinline__ __device__ void dequant_3bit_32(const uint32_t q_0,
 }
 
 } // namespace gptq
-} // namespace vllm
 
 #endif
@@ -7,7 +7,6 @@ Copied from https://github.com/turboderp/exllamav2
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 // Permutation:
 //
@@ -121,6 +120,5 @@ __forceinline__ __device__ void dequant_4bit_8_gptq(const uint32_t q_0,
 }
 }
 } // namespace gptq
-} // namespace vllm
 
 #endif
@@ -7,7 +7,6 @@ Copied from https://github.com/turboderp/exllamav2
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 
 __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {}
@@ -25,6 +24,5 @@ __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0,
 }
 
 } // namespace gptq
-} // namespace vllm
 
 #endif
@@ -5,7 +5,6 @@ Copied from https://github.com/turboderp/exllamav2
 #ifndef _qdq_util_cuh
 #define _qdq_util_cuh
 
-namespace vllm {
 namespace gptq {
 
 union half2_uint32 {
@@ -52,5 +51,4 @@ __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0,
 }
 
 } // namespace gptq
-} // namespace vllm
 #endif
3 files renamed without changes.
4 changes: 2 additions & 2 deletions setup.py
@@ -215,8 +215,8 @@ def get_version_tag() -> str:
         cpp_ext.CUDAExtension(
             'gptqmodel_exllama_eora',
             [
-                "gptqmodel_ext/exllama2-vllm/eora/q_gemm.cu",
-                "gptqmodel_ext/exllama2-vllm/eora/pybind.cu",
+                "gptqmodel_ext/exllama_eora/eora/q_gemm.cu",
+                "gptqmodel_ext/exllama_eora/eora/pybind.cu",
             ],
             extra_link_args=extra_link_args,
             extra_compile_args=extra_compile_args,
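Only the source paths change; the built module keeps the name 'gptqmodel_exllama_eora', so Python-side imports are unaffected. For context, a sketch of the surrounding setup.py wiring after the rename; everything outside the visible hunk (the import alias and flag variables) is an assumption:

    # Sketch of the surrounding setup.py block after the rename. The
    # CUDAExtension entry matches the diff; the import alias and flag values
    # outside the visible hunk are assumptions.
    from torch.utils import cpp_extension as cpp_ext

    extra_link_args = []
    extra_compile_args = {"cxx": ["-O3"], "nvcc": ["-O3"]}

    ext_modules = [
        cpp_ext.CUDAExtension(
            'gptqmodel_exllama_eora',
            [
                "gptqmodel_ext/exllama_eora/eora/q_gemm.cu",
                "gptqmodel_ext/exllama_eora/eora/pybind.cu",
            ],
            extra_link_args=extra_link_args,
            extra_compile_args=extra_compile_args,
        ),
    ]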
