Skip to main content

MiniMind: Lightweight and flexible AI generation library

Project description

MiniMind Example Code

from minimind import GPMGenerator, Sampler, SimpleTokenizer

def main():
    """Fit a GPMGenerator on a few sample pairs and print one chat reply."""
    print("MiniMind GPMGenerator 테스트 시작!")

    # Sample (input, output) pairs used for fitting.
    training_pairs = [
        ("안녕하세요", "안녕하세요 반갑습니다"),
        ("오늘 날씨 어때?", "오늘은 맑고 따뜻해요"),
        ("뭐 먹을래?", "저는 김치찌개 좋아해요"),
    ]

    # The generator is wired to a top-k sampler (k=3) and the simple tokenizer.
    generator = GPMGenerator(
        sampler=Sampler(method='top_k', k=3),
        tokenizer=SimpleTokenizer(),
    )
    generator.fit(training_pairs)

    # Ask for a reply of at most 10 tokens and show prompt and result.
    user_prompt = "안녕하세요"
    reply = generator.chat(user_prompt, max_tokens=10)

    print("입력 프롬프트:", user_prompt)
    print("생성된 텍스트:", reply)


if __name__ == "__main__":
    main()
from minimind import SAPGenerator
from minimind import SimpleTokenizer
def main():
    """Fit a SAPGenerator on a few sample pairs and print one chat reply."""
    print("MiniMind SAPGenerator 테스트 시작!")

    # Small set of (input, output) pairs used for fitting.
    training_pairs = [
        ("안녕하세요", "안녕하세요"),
        ("오늘 날씨 어때?", "날씨가 좋아요"),
        ("밥 먹었어?", "네, 잘 먹었어요"),
        ("영화 볼래?", "좋아요 같이 보자"),
        ("잘 자요", "안녕히 주무세요"),
    ]

    # Create the generator with the simple tokenizer and fit it.
    generator = SAPGenerator(tokenizer=SimpleTokenizer())
    generator.fit(training_pairs)

    # Show the prompt first, then the generated reply (at most 10 tokens).
    user_prompt = "오늘"
    print(f"입력: {user_prompt}")
    reply = generator.chat(user_prompt, max_tokens=10)
    print(f"생성 결과: {reply}")


if __name__ == "__main__":
    main()
def main():
    """Run a smoke test of NeuralGenerator on random dummy data.

    Builds random token-index inputs and random one-hot targets, fits a
    NeuralGenerator configured with a temperature sampler, then generates
    from a short prompt and prints the generated sequence.
    """
    print("MiniMind 패키지 실행 - 테스트 시작!")

    # Imports stay local to the function, as in the original example.
    from minimind import NeuralGenerator
    from minimind import Sampler
    import numpy as np

    sampler = Sampler(method='temperature', temperature=0.8)

    # Dummy data: 50 samples of length-10 token-index sequences.
    vocab_size = 100
    n_samples, seq_len = 50, 10

    # randint's upper bound is exclusive, so pass vocab_size itself;
    # vocab_size - 1 would never produce the last token id.
    X_dummy = np.random.randint(0, vocab_size, size=(n_samples, seq_len))

    # One random one-hot target row per sample.
    y_dummy = np.zeros((n_samples, vocab_size))
    target_ids = np.random.randint(0, vocab_size, size=n_samples)
    y_dummy[np.arange(n_samples), target_ids] = 1.0

    ng = NeuralGenerator(vocab_size=vocab_size, epochs=3, verbose=True, sampler=sampler)
    ng.fit(X_dummy, y_dummy)

    prompt = np.array([1, 2, 3])  # example starting token sequence
    generated_seq = ng.generate(prompt, max_tokens=10)
    print("생성된 시퀀스:", generated_seq)


if __name__ == "__main__":
    main()
# test_sampling.py

import numpy as np
from minimind import top_k_sampling, top_p_sampling, temperature_sampling, Sampler

def dummy_probs(size=100):
    """Return a random vector of length *size* normalized to sum to 1."""
    weights = np.random.rand(size)
    total = weights.sum()
    return weights / total

def test_sampling_functions():
    """Print one result from each standalone sampling function."""
    distribution = dummy_probs()

    print("top_k_sampling:", top_k_sampling(distribution, k=5))
    print("top_p_sampling:", top_p_sampling(distribution, p=0.8))

    # Temperature sampling is shown at a low and a high temperature.
    for label, temp in (
        ("temperature_sampling (temp=0.5):", 0.5),
        ("temperature_sampling (temp=2.0):", 2.0),
    ):
        print(label, temperature_sampling(distribution, temperature=temp))

def test_sampler_class():
    """Print samples from one Sampler instance under three configurations."""
    distribution = dummy_probs()

    sampler = Sampler(method='top_p', p=0.9)
    print("Sampler top_p:", sampler.sample(distribution))

    # Reconfigure the same instance in place and sample once per setting.
    for method, attr, value, label in (
        ('top_k', 'k', 3, "Sampler top_k:"),
        ('temperature', 'temperature', 0.7, "Sampler temperature:"),
    ):
        sampler.method = method
        setattr(sampler, attr, value)
        print(label, sampler.sample(distribution))

if __name__ == "__main__":
    # Run both sampling demos only when this file is executed as a script.
    test_sampling_functions()
    test_sampler_class()
from minimind import SimpleTokenizer

# Tokenizer round-trip demo: tokenize a mixed English/Korean sentence,
# print the tokens, then detokenize them and print the result.
tokenizer = SimpleTokenizer()

# Sample text mixing Latin letters, Hangul, punctuation and digits.
text = "Hello, 안녕하세요! Let's test the tokenizer 123."
tokens = tokenizer.tokenize(text)
print("토큰:", tokens)

# Feed the tokens back through detokenize() and show what comes out.
reconstructed = tokenizer.detokenize(tokens)
print("복원된 문장:", reconstructed)
import os
import numpy as np
from minimind import set_seed, save_json, load_json, save_model_weights, load_model_weights, simple_logger


if __name__ == "__main__":
    # Smoke tests for the utility helpers; defined and run only when this
    # file is executed as a script.

    def test_set_seed():
        """Check that re-seeding with the same value reproduces NumPy draws."""
        set_seed(123)
        a = np.random.rand(3)
        set_seed(123)
        b = np.random.rand(3)
        assert np.allclose(a, b), "set_seed 실패!"
        print("set_seed 테스트 통과!")

    def test_save_load_json():
        """Round-trip a small dict through save_json / load_json."""
        data = {'name': 'MiniMind', 'version': 1.0}
        filepath = 'test.json'
        try:
            save_json(data, filepath)
            loaded = load_json(filepath)
            assert data == loaded, "JSON 저장/로드 실패!"
        finally:
            # Remove the scratch file even when the round trip fails, so a
            # failed run does not leave test.json behind.
            if os.path.exists(filepath):
                os.remove(filepath)
        print("save_json & load_json 테스트 통과!")

    def test_save_load_weights_multi_format():
        """Round-trip a small weight dict through each format listed below."""
        weights = {
            'W1': np.array([1, 2, 3]),
            'b1': np.array([0.1, 0.2, 0.3])
        }
        for fmt in ['npz', 'joblib', 'json']:
            filepath = f"weights_test.{fmt}"
            try:
                save_model_weights(weights, filepath, format=fmt)
                loaded = load_model_weights(filepath, format=fmt)
                for k in weights:
                    assert np.allclose(weights[k], loaded[k]), f"{fmt} {k} 가중치 저장/로드 실패!"
            finally:
                # Clean up per format so a failure in one format does not
                # leave its scratch file on disk.
                if os.path.exists(filepath):
                    os.remove(filepath)
        print("멀티 포맷 가중치 저장/로드 테스트 통과!")

    def test_logger():
        """Emit a single message through simple_logger."""
        simple_logger("테스트 로그 메시지")

    # Run all checks in order.
    test_set_seed()
    test_save_load_json()
    test_save_load_weights_multi_format()
    test_logger()
import numpy as np
from minimind import Radec # 네가 만든 클래스 파일명에 맞게 바꿔!
from minimind import Sampler

# 간단한 샘플용 토크나이저 (공백 기준)
def simple_tokenizer(text):
    """Split *text* into tokens on runs of whitespace."""
    trimmed = text.strip()
    tokens = trimmed.split()
    return tokens

# 아주 단순 샘플 샘플러 (확률분포에서 랜덤 샘플링)

def main():
    """Fit a Radec generator on pairs read from MLdata.csv and print one generation."""
    import csv

    csv_path = "MLdata.csv"

    # Each CSV row yields one (input_text, output_text) pair, whitespace-trimmed.
    with open(csv_path, encoding='utf-8') as handle:
        pairs = [
            (record['input_text'].strip(), record['output_text'].strip())
            for record in csv.DictReader(handle)
        ]

    # Set up the generator with the default Sampler and the whitespace tokenizer.
    generator = Radec(n_models=2, sampler=Sampler(), tokenizer=simple_tokenizer)

    # Fit on the first 200 pairs only.
    print("학습 시작...")
    generator.fit(pairs[:200])
    print("학습 완료!")

    # Generate up to 10 tokens for a sample prompt and print them space-joined.
    prompt = "오늘 날씨 어때?"
    print(f"'{prompt}'에 대한 생성 결과:")
    generated_tokens = generator.generate(prompt, max_tokens=10)
    print(" ".join(generated_tokens))


if __name__ == "__main__":
    main()
# minimind/seprod.py

import csv
import re
import numpy as np
import autograd.numpy as anp
from autograd import grad
from minimind import SeProD

# --- Tokenizer (lowercased word tokens extracted with a regex) ---
def simple_tokenizer(text):
    """Return the word tokens of *text* after lowercasing it.

    Tokens are the runs of word characters matched by the regex, so
    punctuation and whitespace never appear in the result.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# --- vocab 빌드 ---
def build_vocab(tokens, min_freq=2):
    """Build token<->index lookup tables from a flat token sequence.

    Only tokens occurring at least *min_freq* times are kept; indices are
    assigned in sorted token order.

    Returns:
        A ``(stoi, itos)`` pair: token -> index and index -> token dicts.
    """
    from collections import Counter

    frequencies = Counter(tokens)
    kept = sorted(
        word for word, count in frequencies.items() if count >= min_freq
    )
    itos = dict(enumerate(kept))
    stoi = {word: index for index, word in itos.items()}
    return stoi, itos

# --- 인코딩 ---
def encode(tokens, stoi):
    """Map *tokens* to indices via *stoi*, skipping tokens that are not in it."""
    ids = []
    for token in tokens:
        if token in stoi:
            ids.append(stoi[token])
    return ids

# --- 데이터셋 생성 (패딩 + max_len) ---
def pad_seq(seq, max_len, pad_idx):
    """Truncate *seq* to *max_len* items and right-pad it with *pad_idx*."""
    clipped = seq[:max_len]
    missing = max_len - len(seq)
    padding = [pad_idx] * missing  # empty when seq already has max_len or more items
    return clipped + padding

def load_dataset(csv_path, max_len=20, min_freq=2, max_samples=1000):
    """Read text pairs from a CSV file and turn them into padded index arrays.

    The CSV must have ``input_text`` and ``output_text`` columns. Both are
    tokenized with ``simple_tokenizer``; an ``"<EOS>"`` marker is appended to
    every output sequence. The vocabulary is built over all tokens read, and
    the padding index is placed right after the last vocabulary index.

    Args:
        csv_path: Path of the UTF-8 CSV file to read.
        max_len: Length every sequence is truncated/padded to.
        min_freq: Minimum token count for a token to enter the vocabulary.
        max_samples: Maximum number of CSV rows to read.

    Returns:
        A tuple ``(X_enc, X_dec, Y, stoi, itos, pad_idx, vocab_size)`` where
        the first three are NumPy arrays built from the padded encoder
        inputs, decoder inputs and shifted targets, and ``vocab_size`` is
        ``len(stoi) + 1`` (the extra slot is the padding index).
    """
    inputs = []
    outputs = []
    all_tokens = []

    # newline='' is what the csv module expects so that quoted fields with
    # embedded newlines are parsed correctly.
    with open(csv_path, encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            if i >= max_samples:
                break
            inp_tokens = simple_tokenizer(row['input_text'])
            out_tokens = simple_tokenizer(row['output_text']) + ["<EOS>"]
            all_tokens.extend(inp_tokens)
            all_tokens.extend(out_tokens)
            inputs.append(inp_tokens)
            outputs.append(out_tokens)

    stoi, itos = build_vocab(all_tokens, min_freq)
    pad_idx = len(stoi)  # the padding token takes the index right after the vocab

    X_enc = []
    X_dec = []
    Y = []

    for inp_tokens, out_tokens in zip(inputs, outputs):
        enc_encoded = encode(inp_tokens, stoi)

        # Encode the whole target once and shift afterwards. encode() drops
        # out-of-vocabulary tokens, so slicing BEFORE encoding could remove a
        # token from only one of the two views (e.g. an OOV first token) and
        # leave decoder inputs and targets misaligned by one position.
        out_encoded = encode(out_tokens, stoi)
        dec_encoded = out_encoded[:-1]  # decoder input: everything but the last id
        y_encoded = out_encoded[1:]     # target: same ids shifted left by one

        X_enc.append(pad_seq(enc_encoded, max_len, pad_idx))
        X_dec.append(pad_seq(dec_encoded, max_len, pad_idx))
        Y.append(pad_seq(y_encoded, max_len, pad_idx))

    vocab_size = len(stoi) + 1  # includes the padding index
    return (np.array(X_enc), np.array(X_dec), np.array(Y), stoi, itos, pad_idx, vocab_size)



# --- 텍스트 생성 (단순 greedy) ---
def generate_text(model, stoi, itos, prompt, max_len=20, pad_idx=None):
    """Greedily decode text for *prompt* using ``model.predict``.

    The prompt is tokenized, encoded with *stoi*, and truncated/padded to
    *max_len*. Decoding starts from an all-padding decoder input and, at each
    step, takes the arg-max token at the current position. It stops at the
    padding index, at the ``"<EOS>"`` index (when the vocabulary has one), or
    after *max_len* steps.

    Args:
        model: Object exposing ``predict(enc_input, dec_input)``.
        stoi: Token -> index mapping.
        itos: Index -> token mapping.
        prompt: Input text.
        max_len: Encoder/decoder sequence length and maximum number of steps.
        pad_idx: Padding index; 0 is used when it is None.

    Returns:
        The generated tokens joined by single spaces; indices missing from
        *itos* are rendered as ``"<UNK>"``.
    """
    pad = pad_idx if pad_idx is not None else 0
    # The end marker added to targets in load_dataset; None if it did not
    # make it into the vocabulary.
    eos_idx = stoi.get("<EOS>")

    enc_ids = encode(simple_tokenizer(prompt), stoi)
    enc_input = np.array([pad_seq(enc_ids, max_len, pad)])

    generated = []

    # There is no <BOS> token, so the decoder input starts as all padding.
    dec_input = np.array([[pad] * max_len])

    for step in range(max_len):
        # NOTE(review): the original comment describes predict()[0] as
        # (seq_len, vocab_size) — confirm against SeProD.predict.
        probs = model.predict(enc_input, dec_input)[0]
        next_token = int(np.argmax(probs[step]))
        # Stop on padding or on the end-of-sequence marker; without the EOS
        # check the literal "<EOS>" (and whatever follows it) leaks into the
        # returned text.
        if next_token == pad or next_token == eos_idx:
            break
        generated.append(next_token)
        dec_input[0, step] = next_token  # feed the chosen token back in

    return " ".join([itos.get(tok, "<UNK>") for tok in generated])

# --- 메인 실행 ---
# --- Main entry point ---
if __name__ == "__main__":
    import sys

    # The dataset path was hard-coded to one machine. Allow overriding it with
    # the first command-line argument; the original path stays the default.
    csv_path = sys.argv[1] if len(sys.argv) > 1 else "C:\\Users\\yuchan\\Code\\MLdata.csv"

    max_len = 20
    X_enc, X_dec, Y, stoi, itos, pad_idx, vocab_size = load_dataset(csv_path, max_len=max_len)

    # Build the model with the vocabulary size and padding index from the dataset.
    model = SeProD(vocab_size=vocab_size, embed_dim=64, hidden_dim=128, max_len=max_len, pad_idx=pad_idx)

    model.fit(X_enc, X_dec, Y, epochs=10, batch_size=64, lr=0.001)

    # Generate text for a sample prompt and print it.
    prompt = "안녕하세요"
    generated_text = generate_text(model, stoi, itos, prompt, max_len=max_len, pad_idx=pad_idx)

    print("Generated Text:")
    print(generated_text)
from minimind import NeuralGenerator

# Build a NeuralGenerator with an explicit embedding size and hidden-layer
# sizes, then call its summary() method.
model = NeuralGenerator(vocab_size=100, embed_dim=32, hidden_layer_sizes=(64, 32))
model.summary()  

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

minimind-0.1.2.tar.gz (16.7 kB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

minimind-0.1.2-py3-none-any.whl (18.3 kB view details)

Uploaded Python 3

File details

Details for the file minimind-0.1.2.tar.gz.

File metadata

  • Download URL: minimind-0.1.2.tar.gz
  • Upload date:
  • Size: 16.7 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.12.2

File hashes

Hashes for minimind-0.1.2.tar.gz
Algorithm Hash digest
SHA256 45af8f8fd7716a3a8a50bdf5954e5f8efb3b2108428e3a4f8d4255b860cca86c
MD5 c2ab1b5637fb6071a3dfb8c2f68e3041
BLAKE2b-256 0b6770b0eb462ab0d5f226003575d484fe375aa7757e28e65a7adba7d65baa33

See more details on using hashes here.

File details

Details for the file minimind-0.1.2-py3-none-any.whl.

File metadata

  • Download URL: minimind-0.1.2-py3-none-any.whl
  • Upload date:
  • Size: 18.3 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.12.2

File hashes

Hashes for minimind-0.1.2-py3-none-any.whl
Algorithm Hash digest
SHA256 c992636112c1e1883e3e25531e5716b4ac673bf0f2e6cccd14f9772bbad86eb3
MD5 e8d7b5f0b0758a55d0102dee3cc36dbb
BLAKE2b-256 b5d4607775c0db7b47051389f535d1332518af6de95ec0cdd9ee43e01215eafe

See more details on using hashes here.

Supported by

AWS (Cloud computing and Security Sponsor) · Datadog (Monitoring) · Depot (Continuous Integration) · Fastly (CDN) · Google (Download Analytics) · Pingdom (Monitoring) · Sentry (Error logging) · StatusPage (Status page)