Commit 4684c2e
rename folder and cleanup namespace
Signed-off-by: Qubitium <[email protected]>
Qubitium committed Mar 2, 2025
1 parent 25f1607 commit 4684c2e
Showing 22 changed files with 6 additions and 22 deletions.
2 changes: 1 addition & 1 deletion gptqmodel/nn_modules/qlinear/exllama_eora.py
@@ -54,7 +54,7 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
 
 
 class ExllamaEoraQuantLinear(BaseQuantLinear):
-    SUPPORTS_BITS = [4, 8]
+    SUPPORTS_BITS = [4]  # fused eora only validated for 4 bits
     SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128]
     SUPPORTS_DESC_ACT = [True, False]
     SUPPORTS_SYM = [True]  # TODO: validate False
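The SUPPORTS_* class attributes advertise which quantization configurations this kernel accepts, so narrowing SUPPORTS_BITS to [4] routes 8-bit models to other kernels instead of the fused EoRA path. A minimal sketch of how such flags can gate kernel selection; the helper supports_config below is hypothetical, not GPTQModel's actual API:

    # Hypothetical helper (not part of GPTQModel): check a requested config
    # against a kernel class's SUPPORTS_* capability flags.
    def supports_config(qlinear_cls, bits, group_size, desc_act, sym):
        return (
            bits in qlinear_cls.SUPPORTS_BITS
            and group_size in qlinear_cls.SUPPORTS_GROUP_SIZE
            and desc_act in qlinear_cls.SUPPORTS_DESC_ACT
            and sym in qlinear_cls.SUPPORTS_SYM
        )

    # After this commit, 8-bit configs no longer match the EoRA kernel:
    supports_config(ExllamaEoraQuantLinear, 8, 128, False, True)  # -> False
    supports_config(ExllamaEoraQuantLinear, 4, 128, False, True)  # -> True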
4 files renamed without changes.
@@ -5,7 +5,6 @@ Copied from https://github.com/turboderp/exllamav2
 #ifndef _compat_cuh
 #define _compat_cuh
 
-namespace vllm {
 namespace gptq {
 // atomicAdd for half types, to support CC < 7.x
 
@@ -60,5 +59,4 @@ __device__ __forceinline__ void atomicAdd(half2* address, half2 val) {
 #endif
 
 } // namespace gptq
-} // namespace vllm
 #endif
@@ -11,7 +11,6 @@ https://github.com/turboderp/exllama
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 
 class MatrixView_half {
@@ -291,5 +290,4 @@ class MatrixView_q8_row {
 };
 
 } // namespace gptq
-} // namespace vllm
 #endif
2 files renamed without changes.
@@ -19,7 +19,6 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa
 #include "qdq_4.cuh"
 #include "qdq_8.cuh"
 
-namespace vllm {
 namespace gptq {
 
 #define BLOCK_KN_SIZE 128
@@ -2074,7 +2073,6 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
 }
 
 } // namespace gptq
-} // namespace vllm
 
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
@@ -2086,7 +2084,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
   at::Tensor temp_dq = torch::empty(
       {b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options);
 
-  vllm::gptq::gemm_half_q_half_cuda(
+  gptq::gemm_half_q_half_cuda(
       at::cuda::getCurrentCUDABlasHandle(), (const half*)a.data_ptr(),
       (const uint32_t*)b_q_weight.data_ptr(),
       (const uint32_t*)b_gptq_qzeros.data_ptr(),
@@ -2112,7 +2110,7 @@ torch::Tensor gptq_gemm_lora(torch::Tensor a, torch::Tensor b_q_weight,
   at::Tensor temp_dq = torch::empty(
       {b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options);
 
-  vllm::gptq::gemm_half_q_half_cuda_eora(
+  gptq::gemm_half_q_half_cuda_eora(
       at::cuda::getCurrentCUDABlasHandle(), (const half*)a.data_ptr(),
       (const uint32_t*)b_q_weight.data_ptr(),
       (const uint32_t*)b_gptq_qzeros.data_ptr(),
@@ -2133,7 +2131,7 @@
 
 void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight));
-  vllm::gptq::shuffle_exllama_weight(
+  gptq::shuffle_exllama_weight(
      (uint32_t*)q_weight.data_ptr(),
      q_perm.device().is_meta() || q_perm.numel() == 0
          ? NULL
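On the host side only the call sites change: temp_dq still holds the fully dequantized weight (packed rows times 32/bit), and the wrappers now resolve into the bare gptq namespace. A minimal Python sketch of driving the compiled extension, assuming the bindings in pybind.cu expose these wrappers under the module name from setup.py; shapes, dtypes, and the exact Python signatures are illustrative assumptions:

    # Illustrative sketch: exercising the compiled extension. The module name
    # 'gptqmodel_exllama_eora' comes from setup.py; the binding names mirror
    # the C++ wrappers above, but exact Python signatures are assumptions.
    import torch
    import gptqmodel_exllama_eora as eora_ext

    bits, k, n = 4, 4096, 4096
    a = torch.randn(8, k, dtype=torch.float16, device="cuda")
    # GPTQ packs `bits`-bit weights into int32 words: k*bits/32 rows of n cols.
    b_q_weight = torch.randint(0, 2**31 - 1, (k * bits // 32, n),
                               dtype=torch.int32, device="cuda")
    q_perm = torch.empty(0, dtype=torch.int32, device="cuda")  # empty => no act-order perm

    eora_ext.gptq_shuffle(b_q_weight, q_perm, bits)  # one-time weight re-layout
    # Per forward pass (qzeros/scales arguments omitted here for brevity):
    # out = eora_ext.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, ...)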
@@ -7,7 +7,6 @@ Copied from https://github.com/turboderp/exllamav2
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 
 // Permutation:
@@ -71,6 +70,5 @@ __forceinline__ __device__ void dequant_2bit_16(const uint32_t q_0,
 }
 
 } // namespace gptq
-} // namespace vllm
 
 #endif
@@ -3,7 +3,6 @@
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 // Permutation:
 //
@@ -144,6 +143,5 @@ __forceinline__ __device__ void dequant_3bit_32(const uint32_t q_0,
 }
 
 } // namespace gptq
-} // namespace vllm
 
 #endif
@@ -7,7 +7,6 @@ Copied from https://github.com/turboderp/exllamav2
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 // Permutation:
 //
@@ -121,6 +120,5 @@ __forceinline__ __device__ void dequant_4bit_8_gptq(const uint32_t q_0,
 }
 }
 } // namespace gptq
-} // namespace vllm
 
 #endif
@@ -7,7 +7,6 @@ Copied from https://github.com/turboderp/exllamav2
 
 #include "qdq_util.cuh"
 
-namespace vllm {
 namespace gptq {
 
 __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {}
@@ -25,6 +24,5 @@ __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0,
 }
 
 } // namespace gptq
-} // namespace vllm
 
 #endif
@@ -5,7 +5,6 @@ Copied from https://github.com/turboderp/exllamav2
 #ifndef _qdq_util_cuh
 #define _qdq_util_cuh
 
-namespace vllm {
 namespace gptq {
 
 union half2_uint32 {
@@ -52,5 +51,4 @@ __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0,
 }
 
 } // namespace gptq
-} // namespace vllm
 #endif
3 files renamed without changes.
4 changes: 2 additions & 2 deletions setup.py
@@ -215,8 +215,8 @@ def get_version_tag() -> str:
         cpp_ext.CUDAExtension(
             'gptqmodel_exllama_eora',
             [
-                "gptqmodel_ext/exllama2-vllm/eora/q_gemm.cu",
-                "gptqmodel_ext/exllama2-vllm/eora/pybind.cu",
+                "gptqmodel_ext/exllama_eora/eora/q_gemm.cu",
+                "gptqmodel_ext/exllama_eora/eora/pybind.cu",
             ],
             extra_link_args=extra_link_args,
             extra_compile_args=extra_compile_args,
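Only the source paths change; the built module keeps the name 'gptqmodel_exllama_eora', so Python-side imports are unaffected. For context, a sketch of the surrounding setup.py wiring after the rename; everything outside the visible hunk (the import alias and flag variables) is an assumption:

    # Sketch of the surrounding setup.py block after the rename. The
    # CUDAExtension entry matches the diff; the import alias and flag values
    # outside the visible hunk are assumptions.
    from torch.utils import cpp_extension as cpp_ext

    extra_link_args = []
    extra_compile_args = {"cxx": ["-O3"], "nvcc": ["-O3"]}

    ext_modules = [
        cpp_ext.CUDAExtension(
            'gptqmodel_exllama_eora',
            [
                "gptqmodel_ext/exllama_eora/eora/q_gemm.cu",
                "gptqmodel_ext/exllama_eora/eora/pybind.cu",
            ],
            extra_link_args=extra_link_args,
            extra_compile_args=extra_compile_args,
        ),
    ]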
