Skip to main content

MiniMind: Lightweight and flexible AI generation library

Project description

MiniMind Example Code

from minimind import GPMGenerator, Sampler, SimpleTokenizer

def main():
    print("MiniMind GPMGenerator 테스트 시작!")

    # 샘플 데이터
    pairs = [
        ("안녕하세요", "안녕하세요 반갑습니다"),
        ("오늘 날씨 어때?", "오늘은 맑고 따뜻해요"),
        ("뭐 먹을래?", "저는 김치찌개 좋아해요"),
    ]

    # 샘플러 생성 (top-k 예시)
    sampler = Sampler(method='top_k', k=3)
    tokenizer = SimpleTokenizer()

    # 생성기 초기화 시 sampler 연결
    gpm = GPMGenerator(sampler=sampler, tokenizer=tokenizer)
    gpm.fit(pairs)

    # 생성 테스트
    prompt = "안녕하세요"
    response = gpm.chat(prompt, max_tokens=10)

    print("입력 프롬프트:", prompt)
    print("생성된 텍스트:", response)

if __name__ == "__main__":
    main()
from minimind import SAPGenerator
from minimind import SimpleTokenizer
def main():
    print("MiniMind SAPGenerator 테스트 시작!")

    # 간단한 데이터 샘플 (입력-출력 쌍)
    pairs = [
        ("안녕하세요", "안녕하세요"),
        ("오늘 날씨 어때?", "날씨가 좋아요"),
        ("밥 먹었어?", "네, 잘 먹었어요"),
        ("영화 볼래?", "좋아요 같이 보자"),
        ("잘 자요", "안녕히 주무세요"),
    ]

    # SAPGenerator 인스턴스 생성 및 학습
    tokenizer = SimpleTokenizer()
    sap_gen = SAPGenerator(tokenizer=tokenizer)
    sap_gen.fit(pairs)

    # 생성 테스트
    prompt = "오늘"
    print(f"입력: {prompt}")
    generated = sap_gen.chat(prompt, max_tokens=10)
    print(f"생성 결과: {generated}")

if __name__ == "__main__":
    main()
def main():
    print("MiniMind 패키지 실행 - 테스트 시작!")
    
    # 여기서 간단히 NeuralGenerator 테스트 예시 실행
    from minimind import NeuralGenerator
    from minimind import Sampler

    sampler = Sampler(method='temperature', temperature=0.8)
    
    # 더미 데이터 (토큰 인덱스 배열) 예시
    import numpy as np
    vocab_size = 100
    X_dummy = np.random.randint(0, vocab_size-1, size=(50, 10))  # 50샘플, 길이10 시퀀스
    y_dummy = np.zeros((50, vocab_size))
    for i in range(50):
        y_dummy[i, np.random.randint(0, vocab_size)] = 1.0  # 랜덤 원핫 출력
    
    ng = NeuralGenerator(vocab_size=vocab_size, epochs=3, verbose=True, sampler=sampler)
    ng.fit(X_dummy, y_dummy)
    
    prompt = np.array([1, 2, 3])  # 시작 토큰 시퀀스 예시
    generated_seq = ng.generate(prompt, max_tokens=10)
    print("생성된 시퀀스:", generated_seq)

if __name__ == "__main__":
    main()
# test_sampling.py

import numpy as np
from minimind import top_k_sampling, top_p_sampling, temperature_sampling, Sampler

def dummy_probs(size=100):
    probs = np.random.rand(size)
    return probs / probs.sum()

def test_sampling_functions():
    probs = dummy_probs()

    print("top_k_sampling:", top_k_sampling(probs, k=5))
    print("top_p_sampling:", top_p_sampling(probs, p=0.8))
    print("temperature_sampling (temp=0.5):", temperature_sampling(probs, temperature=0.5))
    print("temperature_sampling (temp=2.0):", temperature_sampling(probs, temperature=2.0))

def test_sampler_class():
    probs = dummy_probs()
    sampler = Sampler(method='top_p', p=0.9)
    print("Sampler top_p:", sampler.sample(probs))

    sampler.method = 'top_k'
    sampler.k = 3
    print("Sampler top_k:", sampler.sample(probs))

    sampler.method = 'temperature'
    sampler.temperature = 0.7
    print("Sampler temperature:", sampler.sample(probs))

if __name__ == "__main__":
    test_sampling_functions()
    test_sampler_class()
from minimind import SimpleTokenizer

tokenizer = SimpleTokenizer()

text = "Hello, 안녕하세요! Let's test the tokenizer 123."
tokens = tokenizer.tokenize(text)
print("토큰:", tokens)

reconstructed = tokenizer.detokenize(tokens)
print("복원된 문장:", reconstructed)
import os
import numpy as np
from minimind import set_seed, save_json, load_json, save_model_weights, load_model_weights, simple_logger


if __name__ == "__main__":
    # 테스트 함수들

    def test_set_seed():
        set_seed(123)
        a = np.random.rand(3)
        set_seed(123)
        b = np.random.rand(3)
        assert np.allclose(a, b), "set_seed 실패!"
        print("set_seed 테스트 통과!")

    def test_save_load_json():
        data = {'name': 'MiniMind', 'version': 1.0}
        filepath = 'test.json'
        save_json(data, filepath)
        loaded = load_json(filepath)
        assert data == loaded, "JSON 저장/로드 실패!"
        os.remove(filepath)
        print("save_json & load_json 테스트 통과!")

    def test_save_load_weights_multi_format():
        weights = {
            'W1': np.array([1, 2, 3]),
            'b1': np.array([0.1, 0.2, 0.3])
        }
        for fmt in ['npz', 'joblib', 'json']:
            filepath = f"weights_test.{fmt}"
            save_model_weights(weights, filepath, format=fmt)
            loaded = load_model_weights(filepath, format=fmt)
            for k in weights:
                assert np.allclose(weights[k], loaded[k]), f"{fmt} {k} 가중치 저장/로드 실패!"
            os.remove(filepath)
        print("멀티 포맷 가중치 저장/로드 테스트 통과!")

    def test_logger():
        simple_logger("테스트 로그 메시지")

    # 실행 테스트 모음
    test_set_seed()
    test_save_load_json()
    test_save_load_weights_multi_format()
    test_logger()
import numpy as np
from minimind import Radec # 네가 만든 클래스 파일명에 맞게 바꿔!
from minimind import Sampler

# 간단한 샘플용 토크나이저 (공백 기준)
def simple_tokenizer(text):
    return text.strip().split()

# 아주 단순 샘플 샘플러 (확률분포에서 랜덤 샘플링)

def main():

    import csv
    csv_path = "MLdata.csv"

    pairs = []
    with open(csv_path, encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            pairs.append((row['input_text'].strip(), row['output_text'].strip()))


    # 생성기 초기화
    generator = Radec(n_models=2, sampler=Sampler(), tokenizer=simple_tokenizer)

    # 학습
    print("학습 시작...")
    generator.fit(pairs[:200])
    print("학습 완료!")

    # 생성 테스트
    prompt = "오늘 날씨 어때?"
    print(f"'{prompt}'에 대한 생성 결과:")
    generated_tokens = generator.generate(prompt, max_tokens=10)
    print(" ".join(generated_tokens))

if __name__ == "__main__":
    main()
# minimind/seprod.py

import csv
import re
import numpy as np
import autograd.numpy as anp
from autograd import grad
from minimind import SeProD

# --- 토크나이저 (공백 단위) ---
def simple_tokenizer(text):
    return re.findall(r'\b\w+\b', text.lower())

# --- vocab 빌드 ---
def build_vocab(tokens, min_freq=2):
    from collections import Counter
    counter = Counter(tokens)
    vocab = [w for w, c in counter.items() if c >= min_freq]
    vocab = sorted(vocab)
    stoi = {w: i for i, w in enumerate(vocab)}
    itos = {i: w for i, w in enumerate(vocab)}
    return stoi, itos

# --- 인코딩 ---
def encode(tokens, stoi):
    return [stoi[t] for t in tokens if t in stoi]

# --- 데이터셋 생성 (패딩 + max_len) ---
def pad_seq(seq, max_len, pad_idx):
    return seq[:max_len] + [pad_idx]*(max_len - len(seq))

def load_dataset(csv_path, max_len=20, min_freq=2, max_samples=1000):
    inputs = []
    outputs = []
    all_tokens = []

    with open(csv_path, encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            if i >= max_samples:
                break
            inp_tokens = simple_tokenizer(row['input_text'])
            out_tokens = simple_tokenizer(row['output_text']) + ["<EOS>"]
            all_tokens.extend(inp_tokens)
            all_tokens.extend(out_tokens)
            inputs.append(inp_tokens)
            outputs.append(out_tokens)

    stoi, itos = build_vocab(all_tokens, min_freq)
    pad_idx = len(stoi)  # 패딩 토큰은 vocab 끝에 추가

    X_enc = []
    X_dec = []
    Y = []

    for inp_tokens, out_tokens in zip(inputs, outputs):
        enc_encoded = encode(inp_tokens, stoi)
        dec_encoded = encode(out_tokens[:-1], stoi)  # 디코더 입력 (out_tokens - 마지막)
        y_encoded = encode(out_tokens[1:], stoi)      # 타깃 (out_tokens shifted)

        enc_padded = pad_seq(enc_encoded, max_len, pad_idx)
        dec_padded = pad_seq(dec_encoded, max_len, pad_idx)
        y_padded = pad_seq(y_encoded, max_len, pad_idx)

        X_enc.append(enc_padded)
        X_dec.append(dec_padded)
        Y.append(y_padded)

    vocab_size = len(stoi) + 1  # 패딩 포함
    return (np.array(X_enc), np.array(X_dec), np.array(Y), stoi, itos, pad_idx, vocab_size)



# --- 텍스트 생성 (단순 greedy) ---
def generate_text(model, stoi, itos, prompt, max_len=20, pad_idx=None):
    prompt_tokens = simple_tokenizer(prompt)
    enc_input = encode(prompt_tokens, stoi)
    enc_input = enc_input[:max_len]
    pad = pad_idx if pad_idx is not None else 0
    enc_input = enc_input + [pad]*(max_len - len(enc_input))
    enc_input = np.array([enc_input])

    generated = []

    # 디코더 입력 처음은 <BOS> 대신 빈 배열 or 패딩으로 시작
    dec_input = [pad] * max_len
    dec_input = np.array([dec_input])

    for _ in range(max_len):
        probs = model.predict(enc_input, dec_input)[0]  # (seq_len, vocab_size)
        next_token = np.argmax(probs[len(generated)])
        if next_token == pad:
            break
        generated.append(next_token)
        dec_input[0, len(generated)-1] = next_token

    return " ".join([itos.get(tok, "<UNK>") for tok in generated])

# --- 메인 실행 ---
if __name__ == "__main__":
    csv_path = "C:\\Users\\yuchan\\Code\\MLdata.csv"  # 경로 조정

    max_len = 20
    X_enc, X_dec, Y, stoi, itos, pad_idx, vocab_size = load_dataset(csv_path, max_len=max_len)

    model = SeProD(vocab_size=vocab_size, embed_dim=64, hidden_dim=128, max_len=max_len, pad_idx=pad_idx)

    model.fit(X_enc, X_dec, Y, epochs=10, batch_size=64, lr=0.001)

    prompt = "안녕하세요"
    generated_text = generate_text(model, stoi, itos, prompt, max_len=max_len, pad_idx=pad_idx)

    print("Generated Text:")
    print(generated_text)
from minimind import NeuralGenerator

model = NeuralGenerator(vocab_size=100, embed_dim=32, hidden_layer_sizes=(64, 32))
model.summary()  

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

minimind-0.2.1.tar.gz (3.8 MB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

minimind-0.2.1-py3-none-any.whl (3.8 MB view details)

Uploaded Python 3

File details

Details for the file minimind-0.2.1.tar.gz.

File metadata

  • Download URL: minimind-0.2.1.tar.gz
  • Upload date:
  • Size: 3.8 MB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.12.2

File hashes

Hashes for minimind-0.2.1.tar.gz
Algorithm Hash digest
SHA256 13cb8dde5c137697daf55052ad1dd28a15ab56853409dc8ab7d6fd2f400e9061
MD5 71001efeb7f6abc55d5c98afb4389db7
BLAKE2b-256 28c2975505167705f12e2b7ff21098215243fbbbd3a203bc2e3c34a4fc32aaca

See more details on using hashes here.

File details

Details for the file minimind-0.2.1-py3-none-any.whl.

File metadata

  • Download URL: minimind-0.2.1-py3-none-any.whl
  • Upload date:
  • Size: 3.8 MB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.12.2

File hashes

Hashes for minimind-0.2.1-py3-none-any.whl
Algorithm Hash digest
SHA256 76f00dae3981c4b3a2db4ac6058a614ffef87b5b48bed05bd4aa2eb98a3fd858
MD5 0fddfa136730245046e9f76bce8f7eca
BLAKE2b-256 07ef024c9f049f2f644a56ed960e34b0f2ced0fe05a2d488997a53a1725e2957

See more details on using hashes here.

Supported by

AWS Cloud computing and Security Sponsor Datadog Monitoring Depot Continuous Integration Fastly CDN Google Download Analytics Pingdom Monitoring Sentry Error logging StatusPage Status page