Skip to main content

MiniMind: Lightweight and flexible AI generation library

Project description

MiniMind Example Code

from minimind import GPMGenerator, Sampler, SimpleTokenizer

def main():
    """Train a GPMGenerator on a few sentence pairs and print one chat reply."""
    print("MiniMind GPMGenerator 테스트 시작!")

    # Tiny (prompt, response) corpus used for fitting.
    training_pairs = [
        ("안녕하세요", "안녕하세요 반갑습니다"),
        ("오늘 날씨 어때?", "오늘은 맑고 따뜻해요"),
        ("뭐 먹을래?", "저는 김치찌개 좋아해요"),
    ]

    # The generator is wired with a top-k sampler (k=3) and the stock tokenizer.
    generator = GPMGenerator(
        sampler=Sampler(method='top_k', k=3),
        tokenizer=SimpleTokenizer(),
    )
    generator.fit(training_pairs)

    # Ask for a reply of at most 10 tokens and show prompt and result.
    user_prompt = "안녕하세요"
    reply = generator.chat(user_prompt, max_tokens=10)

    print("입력 프롬프트:", user_prompt)
    print("생성된 텍스트:", reply)

# Run the GPMGenerator demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
from minimind import SAPGenerator
from minimind import SimpleTokenizer
def main():
    """Fit a SAPGenerator on five sentence pairs and print one generated reply."""
    print("MiniMind SAPGenerator 테스트 시작!")

    # Small sample corpus of (input, output) pairs.
    training_pairs = [
        ("안녕하세요", "안녕하세요"),
        ("오늘 날씨 어때?", "날씨가 좋아요"),
        ("밥 먹었어?", "네, 잘 먹었어요"),
        ("영화 볼래?", "좋아요 같이 보자"),
        ("잘 자요", "안녕히 주무세요"),
    ]

    # Build the generator around the stock tokenizer and train it.
    generator = SAPGenerator(tokenizer=SimpleTokenizer())
    generator.fit(training_pairs)

    # Generation check: echo the prompt, then the reply (at most 10 tokens).
    user_prompt = "오늘"
    print(f"입력: {user_prompt}")
    reply = generator.chat(user_prompt, max_tokens=10)
    print(f"생성 결과: {reply}")

# Run the SAPGenerator demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
def main():
    """Smoke-test NeuralGenerator on random dummy data.

    Builds random integer token sequences and random one-hot targets, fits a
    NeuralGenerator for three epochs with a temperature sampler, then
    generates up to 10 tokens from a short prompt and prints the result.
    """
    print("MiniMind 패키지 실행 - 테스트 시작!")

    # Imports stay function-local, as in the original example.
    from minimind import NeuralGenerator
    from minimind import Sampler
    import numpy as np

    sampler = Sampler(method='temperature', temperature=0.8)

    vocab_size = 100
    n_samples = 50
    seq_len = 10

    # np.random.randint excludes its upper bound, so `vocab_size` (not
    # `vocab_size - 1`) is needed for every token id 0..vocab_size-1 to be
    # reachable in the dummy inputs.
    X_dummy = np.random.randint(0, vocab_size, size=(n_samples, seq_len))

    # One random one-hot target row per sample.
    y_dummy = np.zeros((n_samples, vocab_size))
    target_ids = np.random.randint(0, vocab_size, size=n_samples)
    y_dummy[np.arange(n_samples), target_ids] = 1.0

    ng = NeuralGenerator(vocab_size=vocab_size, epochs=3, verbose=True, sampler=sampler)
    ng.fit(X_dummy, y_dummy)

    prompt = np.array([1, 2, 3])  # example start-token sequence
    generated_seq = ng.generate(prompt, max_tokens=10)
    print("생성된 시퀀스:", generated_seq)

# Run the NeuralGenerator demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
# test_sampling.py

import numpy as np
from minimind import top_k_sampling, top_p_sampling, temperature_sampling, Sampler

def dummy_probs(size=100):
    """Return `size` random non-negative weights normalised to sum to 1."""
    raw = np.random.rand(size)
    total = raw.sum()
    return raw / total

def test_sampling_functions():
    """Call each stand-alone sampling function once on a random distribution and print the result."""
    distribution = dummy_probs()

    picked_top_k = top_k_sampling(distribution, k=5)
    print("top_k_sampling:", picked_top_k)

    picked_top_p = top_p_sampling(distribution, p=0.8)
    print("top_p_sampling:", picked_top_p)

    # Temperature sampling is exercised at one value below 1 and one above 1.
    picked_cool = temperature_sampling(distribution, temperature=0.5)
    print("temperature_sampling (temp=0.5):", picked_cool)

    picked_warm = temperature_sampling(distribution, temperature=2.0)
    print("temperature_sampling (temp=2.0):", picked_warm)

def test_sampler_class():
    """Sample from one Sampler instance while switching its method between top_p, top_k and temperature."""
    distribution = dummy_probs()

    # Start with nucleus (top-p) sampling.
    picker = Sampler(method='top_p', p=0.9)
    top_p_choice = picker.sample(distribution)
    print("Sampler top_p:", top_p_choice)

    # Reconfigure the same instance for top-k.
    picker.method = 'top_k'
    picker.k = 3
    top_k_choice = picker.sample(distribution)
    print("Sampler top_k:", top_k_choice)

    # Reconfigure once more for temperature sampling.
    picker.method = 'temperature'
    picker.temperature = 0.7
    temperature_choice = picker.sample(distribution)
    print("Sampler temperature:", temperature_choice)

# Run both sampling checks only when this file is executed as a script.
if __name__ == "__main__":
    test_sampling_functions()
    test_sampler_class()
from minimind import SimpleTokenizer

# Tokenizer round-trip demo: tokenize a sample sentence, then detokenize it.
tokenizer = SimpleTokenizer()

# Sample mixes English, Korean, punctuation and digits.
text = "Hello, 안녕하세요! Let's test the tokenizer 123."
tokens = tokenizer.tokenize(text)
print("토큰:", tokens)

# Print the detokenized text so it can be compared with the input by eye.
reconstructed = tokenizer.detokenize(tokens)
print("복원된 문장:", reconstructed)
import os
import numpy as np
from minimind import set_seed, save_json, load_json, save_model_weights, load_model_weights, simple_logger


if __name__ == "__main__":
    # Smoke tests for the minimind utility helpers. Each test writes its
    # scratch files into the current working directory and removes them again,
    # even when a save/load call or an assertion fails.

    def test_set_seed():
        """Check that re-seeding with the same value reproduces NumPy's random draws."""
        set_seed(123)
        a = np.random.rand(3)
        set_seed(123)
        b = np.random.rand(3)
        assert np.allclose(a, b), "set_seed 실패!"
        print("set_seed 테스트 통과!")

    def test_save_load_json():
        """Round-trip a small dict through save_json / load_json."""
        data = {'name': 'MiniMind', 'version': 1.0}
        filepath = 'test.json'
        try:
            save_json(data, filepath)
            loaded = load_json(filepath)
            assert data == loaded, "JSON 저장/로드 실패!"
        finally:
            # Clean up on failure too; skip if the file was never created so
            # the original error is not masked by FileNotFoundError.
            if os.path.exists(filepath):
                os.remove(filepath)
        print("save_json & load_json 테스트 통과!")

    def test_save_load_weights_multi_format():
        """Round-trip a weight dict through every format listed below."""
        weights = {
            'W1': np.array([1, 2, 3]),
            'b1': np.array([0.1, 0.2, 0.3])
        }
        for fmt in ['npz', 'joblib', 'json']:
            filepath = f"weights_test.{fmt}"
            try:
                save_model_weights(weights, filepath, format=fmt)
                loaded = load_model_weights(filepath, format=fmt)
                for k in weights:
                    assert np.allclose(weights[k], loaded[k]), f"{fmt} {k} 가중치 저장/로드 실패!"
            finally:
                # Same cleanup rule as above, applied per format.
                if os.path.exists(filepath):
                    os.remove(filepath)
        print("멀티 포맷 가중치 저장/로드 테스트 통과!")

    def test_logger():
        """Emit one message through simple_logger."""
        simple_logger("테스트 로그 메시지")

    # Run the whole suite in order.
    test_set_seed()
    test_save_load_json()
    test_save_load_weights_multi_format()
    test_logger()
import numpy as np
from minimind import Radec # 네가 만든 클래스 파일명에 맞게 바꿔!
from minimind import Sampler

# Minimal sample tokenizer (splits on whitespace).
def simple_tokenizer(text):
    """Return the whitespace-separated pieces of `text`, ignoring surrounding whitespace."""
    trimmed = text.strip()
    return trimmed.split()

# 아주 단순 샘플 샘플러 (확률분포에서 랜덤 샘플링)

def main():
    """Train a Radec generator from a CSV of text pairs and print one generation.

    Reads ``MLdata.csv`` from the current working directory; the file must
    have ``input_text`` and ``output_text`` columns. At most the first 200
    pairs are used for fitting.
    """
    import csv

    csv_path = "MLdata.csv"

    # newline='' lets the csv module do its own newline handling, which the
    # csv documentation requires for quoted fields that contain line breaks.
    with open(csv_path, encoding='utf-8', newline='') as f:
        pairs = [
            (row['input_text'].strip(), row['output_text'].strip())
            for row in csv.DictReader(f)
        ]

    # Set up the generator.
    generator = Radec(n_models=2, sampler=Sampler(), tokenizer=simple_tokenizer)

    # Train.
    print("학습 시작...")
    generator.fit(pairs[:200])
    print("학습 완료!")

    # Generation check.
    prompt = "오늘 날씨 어때?"
    print(f"'{prompt}'에 대한 생성 결과:")
    generated_tokens = generator.generate(prompt, max_tokens=10)
    print(" ".join(generated_tokens))

# Run the Radec demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
# minimind/seprod.py

import csv
import re
import numpy as np
import autograd.numpy as anp
from autograd import grad
from minimind import SeProD

# --- Tokenizer (lower-cased word characters) ---
def simple_tokenizer(text):
    """Lower-case `text` and return its runs of word characters as a list."""
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)

# --- Vocabulary construction ---
def build_vocab(tokens, min_freq=2):
    """Build index mappings for tokens that occur at least `min_freq` times.

    Kept tokens are sorted and numbered from 0.

    Returns:
        (stoi, itos): token -> index and index -> token dictionaries.
    """
    from collections import Counter

    frequencies = Counter(tokens)
    kept = sorted(word for word, count in frequencies.items() if count >= min_freq)

    stoi = {}
    itos = {}
    for index, word in enumerate(kept):
        stoi[word] = index
        itos[index] = word
    return stoi, itos

# --- Encoding ---
def encode(tokens, stoi):
    """Map tokens to their indices in `stoi`, silently dropping unknown tokens."""
    ids = []
    for token in tokens:
        if token in stoi:
            ids.append(stoi[token])
    return ids

# --- Dataset construction (padding + max_len) ---
def pad_seq(seq, max_len, pad_idx):
    """Truncate the list `seq` to `max_len` items, or right-pad it with `pad_idx` up to `max_len`."""
    # A negative count yields an empty filler, so over-long input is only truncated.
    filler = [pad_idx] * (max_len - len(seq))
    return seq[:max_len] + filler

def load_dataset(csv_path, max_len=20, min_freq=2, max_samples=1000):
    """Load text pairs from a CSV and turn them into padded index arrays.

    The CSV must have ``input_text`` and ``output_text`` columns. Each output
    gets an ``"<EOS>"`` token appended. The vocabulary is built from all input
    and output tokens; tokens rarer than `min_freq` are dropped on encoding.

    Args:
        csv_path: Path of the CSV file to read (UTF-8).
        max_len: Length every encoded sequence is truncated/padded to.
        min_freq: Minimum token count for a token to enter the vocabulary.
        max_samples: Maximum number of CSV rows to read.

    Returns:
        Tuple ``(X_enc, X_dec, Y, stoi, itos, pad_idx, vocab_size)`` where
        the first three are NumPy arrays built from rows of length `max_len`
        (encoder input, decoder input, shifted target), ``pad_idx`` equals
        ``len(stoi)`` and ``vocab_size`` is ``len(stoi) + 1``.
    """
    inputs = []
    outputs = []
    all_tokens = []

    # newline='' lets the csv module do its own newline handling, which the
    # csv documentation requires for quoted fields that contain line breaks.
    with open(csv_path, encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            if i >= max_samples:
                break
            inp_tokens = simple_tokenizer(row['input_text'])
            out_tokens = simple_tokenizer(row['output_text']) + ["<EOS>"]
            all_tokens.extend(inp_tokens)
            all_tokens.extend(out_tokens)
            inputs.append(inp_tokens)
            outputs.append(out_tokens)

    stoi, itos = build_vocab(all_tokens, min_freq)
    pad_idx = len(stoi)  # the padding id is placed right after the vocabulary

    X_enc = []
    X_dec = []
    Y = []

    for inp_tokens, out_tokens in zip(inputs, outputs):
        enc_encoded = encode(inp_tokens, stoi)

        # Encode the output once and shift afterwards. Slicing the raw tokens
        # first and encoding each slice separately misaligns decoder input and
        # target when the first output token is out of vocabulary, because
        # encode() drops it from one slice but not from the other.
        out_encoded = encode(out_tokens, stoi)
        dec_encoded = out_encoded[:-1]  # decoder input: everything but the last id
        y_encoded = out_encoded[1:]     # target: the same ids shifted left by one

        X_enc.append(pad_seq(enc_encoded, max_len, pad_idx))
        X_dec.append(pad_seq(dec_encoded, max_len, pad_idx))
        Y.append(pad_seq(y_encoded, max_len, pad_idx))

    vocab_size = len(stoi) + 1  # vocabulary plus the padding id
    return (np.array(X_enc), np.array(X_dec), np.array(Y), stoi, itos, pad_idx, vocab_size)



# --- Text generation (plain greedy decoding) ---
def generate_text(model, stoi, itos, prompt, max_len=20, pad_idx=None):
    """Greedily decode a reply to `prompt` with `model`.

    The prompt is tokenized, encoded, and truncated/padded to `max_len`. The
    decoder input starts as all padding and is filled in one position per
    step with the arg-max token; decoding stops at the first padding
    prediction or after `max_len` steps.

    Args:
        model: Object exposing ``predict(enc_input, dec_input)``; element
            ``[0]`` of its result is indexed by position
            (assumed shape (seq_len, vocab_size) - TODO confirm).
        stoi: Token -> index mapping used to encode the prompt.
        itos: Index -> token mapping used to decode the output.
        prompt: Input text.
        max_len: Sequence length and maximum number of decoding steps.
        pad_idx: Padding id; 0 is used when it is None.

    Returns:
        The generated tokens joined by spaces; ids missing from `itos`
        appear as ``"<UNK>"``.
    """
    prompt_ids = encode(simple_tokenizer(prompt), stoi)[:max_len]
    pad = 0 if pad_idx is None else pad_idx

    # Encoder input: one padded row holding the prompt ids.
    prompt_ids = prompt_ids + [pad] * (max_len - len(prompt_ids))
    enc_input = np.array([prompt_ids])

    # There is no <BOS> token; the decoder input starts as pure padding.
    dec_input = np.array([[pad] * max_len])

    generated = []
    for position in range(max_len):
        probs = model.predict(enc_input, dec_input)[0]
        next_token = np.argmax(probs[position])
        if next_token == pad:
            break
        generated.append(next_token)
        # Feed the chosen token back in at the position it was predicted for.
        dec_input[0, position] = next_token

    return " ".join(itos.get(tok, "<UNK>") for tok in generated)

# --- Main entry point ---
# Loads the dataset, trains a SeProD model and prints one generated reply.
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute Windows path; adjust it for your machine.
    csv_path = "C:\\Users\\yuchan\\Code\\MLdata.csv"  # adjust this path

    max_len = 20
    X_enc, X_dec, Y, stoi, itos, pad_idx, vocab_size = load_dataset(csv_path, max_len=max_len)

    # vocab_size already includes the padding id returned by load_dataset.
    model = SeProD(vocab_size=vocab_size, embed_dim=64, hidden_dim=128, max_len=max_len, pad_idx=pad_idx)

    model.fit(X_enc, X_dec, Y, epochs=10, batch_size=64, lr=0.001)

    prompt = "안녕하세요"
    generated_text = generate_text(model, stoi, itos, prompt, max_len=max_len, pad_idx=pad_idx)

    print("Generated Text:")
    print(generated_text)
from minimind import NeuralGenerator

# Build a NeuralGenerator with a 100-token vocabulary and hidden layer sizes (64, 32).
model = NeuralGenerator(vocab_size=100, embed_dim=32, hidden_layer_sizes=(64, 32))
model.summary()  # presumably prints a description of the model - verify against minimind

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

minimind-0.1.5.tar.gz (3.8 MB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

minimind-0.1.5-py3-none-any.whl (3.8 MB view details)

Uploaded Python 3

File details

Details for the file minimind-0.1.5.tar.gz.

File metadata

  • Download URL: minimind-0.1.5.tar.gz
  • Upload date:
  • Size: 3.8 MB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.12.2

File hashes

Hashes for minimind-0.1.5.tar.gz
Algorithm Hash digest
SHA256 3b9cd906dcbdd61c71c935a3a15c19d02c61313440845d00b99093c472d55a5a
MD5 eb99ba4195ef922f9c543e84662cb4f3
BLAKE2b-256 c44297da2ba194117544b71fe1e99e782806b5532488a755c9758fb5f1f54dcb

See more details on using hashes here.

File details

Details for the file minimind-0.1.5-py3-none-any.whl.

File metadata

  • Download URL: minimind-0.1.5-py3-none-any.whl
  • Upload date:
  • Size: 3.8 MB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.12.2

File hashes

Hashes for minimind-0.1.5-py3-none-any.whl
Algorithm Hash digest
SHA256 a08aa87601ddda6ccc5961873cc1559a654cd285c40d518fc1b2b26ecf02312d
MD5 d12ffb0e5abcc513830a568605aed495
BLAKE2b-256 8e24fcc4c2de2783e9de9b4fb7a8e5086cb3a3c9029022b0e7c5fe356903ae8c

See more details on using hashes here.

Supported by

AWS (Cloud computing and Security Sponsor), Datadog (Monitoring), Depot (Continuous Integration), Fastly (CDN), Google (Download Analytics), Pingdom (Monitoring), Sentry (Error logging), StatusPage (Status page)