
Python wrapper for damo, a set of fast and robust hash functions.

Project description

Damo-Embedding


Quick Install

pip install damo-embedding
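
After installation, a quick sanity check is to import the native `damo` extension that the examples below rely on (a minimal check; the printed path is just informational):

# Post-install check: the native extension used in the examples below should import.
import damo
print("damo loaded from:", damo.__file__)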

Example

Embedding

import damo
import torch
import numpy as np
from typing import Union
from collections import defaultdict


class Storage(object):
    """singleton storage class."""

    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = object.__new__(cls)
            cls._instance.dir = kwargs.get("dir", "./embeddings")
            cls._instance.ttl = kwargs.get("ttl", 8640000)
            cls._instance.storage = damo.PyStorage(cls._instance.dir, cls._instance.ttl)
        return cls._instance

    @staticmethod
    def checkpoint(path: str):
        assert Storage._instance is not None
        Storage._instance.storage.checkpoint(path)

    @staticmethod
    def dump(path: str):
        assert Storage._instance is not None
        Storage._instance.storage.dump(path)

    @staticmethod
    def load_from_checkpoint(path: str):
        assert Storage._instance is not None
        Storage._instance.storage.load_from_checkpoint(path)


class Embedding(torch.nn.Module):
    _group = -1

    def __init__(self, dim: int, initializer={}, optimizer={}, group=-1, **kwargs):
        super(Embedding, self).__init__()
        self.dim = dim
        if group != -1:
            self.group = group
            assert 0 <= self.group < 256
        else:
            Embedding._group += 1
            self.group = Embedding._group
            assert 0 <= self.group < 256
        self.storage = Storage(**kwargs).storage

        # create initializer
        init_params = damo.Parameters()
        for k, v in initializer.items():
            init_params.insert(k, v)
        self.initializer = damo.PyInitializer(init_params)

        # create optimizer
        opt_params = damo.Parameters()
        for k, v in optimizer.items():
            opt_params.insert(k, v)
        self.optimizer = damo.PyOptimizer(opt_params)

        self.embedding = damo.PyEmbedding(
            self.storage, self.optimizer, self.initializer, self.dim, self.group
        )

    def forward(self, inputs: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
        """embedding lookup

        Args:
            inputs (Union[torch.Tensor, np.ndarray]): input values

        Returns:
            torch.Tensor: embedding values (inputs.shape[0], inputs.shape[1], self.dim)
        """

        data = inputs
        if isinstance(inputs, torch.Tensor):
            data = inputs.numpy().astype(np.uint64)
        elif isinstance(inputs, np.ndarray):
            if data.dtype != np.uint64:
                data = inputs.astype(np.uint64)

        batch_size, width = data.shape
        keys = np.unique(np.concatenate(data)).astype(np.uint64)
        length = keys.shape[0]
        weights = np.zeros(length * self.dim, dtype=np.float32)
        self.embedding.lookup(keys, weights)
        weights = weights.reshape((length, self.dim))
        weight_dict = {k: v for k, v in zip(keys, weights)}
        values = np.zeros(shape=(batch_size, width, self.dim), dtype=np.float32)

        for i in range(batch_size):
            for j in range(width):
                key = data[i][j]
                # 0 is padding value
                if key != 0:
                    values[i][j] = weight_dict[key]

        def apply_gradients(gradients):
            grad = gradients.numpy()
            grad = grad.reshape((batch_size, width, self.dim))
            grad_dict = defaultdict(lambda: np.zeros(self.dim, dtype=np.float32))
            for i in range(batch_size):
                for j in range(width):
                    key = data[i][j]
                    if key != 0:
                        grad_dict[key] += grad[i][j]

            flat_grads = np.zeros(length * self.dim, dtype=np.float32)
            for i in range(length):
                flat_grads[i * self.dim : (i + 1) * self.dim] = (
                    grad_dict[keys[i]] / batch_size
                )

            self.embedding.apply_gradients(keys, flat_grads)

        ret = torch.from_numpy(values)
        ret.requires_grad_()
        ret.register_hook(apply_gradients)
        return ret
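
A minimal usage sketch for the Embedding module above. The initializer and optimizer names mirror the DeepFM example below; the feature ids, dimension, and paths are illustrative only, and 0 is treated as the padding id by the forward pass.

import torch

# Hypothetical configuration; valid initializer/optimizer options follow
# the DeepFM example below.
emb = Embedding(
    dim=8,
    initializer={"name": "truncate_normal", "mean": 0.0, "stddev": 0.001},
    optimizer={"name": "adam", "gamma": 0.001},
    dir="./embeddings",  # forwarded to the Storage singleton via **kwargs
)

ids = torch.tensor([[1, 2, 3], [4, 5, 0]], dtype=torch.int64)  # 0 is padding
out = emb(ids)           # shape: (2, 3, 8)
out.sum().backward()     # the registered hook calls apply_gradients

# Persist the underlying storage (illustrative path).
Storage.checkpoint("./checkpoint")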

DeepFM

import torch
import torch.nn as nn
import numpy as np
from typing import Union
from embedding import Embedding


class DeepFM(torch.nn.Module):
    def __init__(
        self,
        emb_size: int,
        fea_size: int,
        hid_dims=[256, 128],
        num_classes=1,
        dropout=[0.2, 0.2],
        **kwargs,
    ):
        super(DeepFM, self).__init__()
        self.emb_size = emb_size
        self.fea_size = fea_size

        initializer = {
            "name": "truncate_normal",
            "mean": float(kwargs.get("mean", 0.0)),
            "stddev": float(kwargs.get("stddev", 0.0001)),
        }

        optimizer = {
            "name": "adam",
            "gamma": float(kwargs.get("gamma", 0.001)),
            "beta1": float(kwargs.get("beta1", 0.9)),
            "beta2": float(kwargs.get("beta2", 0.999)),
            "lambda": float(kwargs.get("lambda", 0.0)),
            "epsilon": float(kwargs.get("epsilon", 1e-8)),
        }

        self.w = Embedding(
            1,
            initializer=initializer,
            optimizer=optimizer,
            group=0,
            **kwargs,
        )

        self.v = Embedding(
            self.emb_size,
            initializer=initializer,
            optimizer=optimizer,
            group=1,
            **kwargs,
        )
        self.w0 = torch.zeros(1, dtype=torch.float32, requires_grad=True)
        self.dims = [fea_size * emb_size] + hid_dims

        self.layers = nn.ModuleList()
        for i in range(1, len(self.dims)):
            self.layers.append(nn.Linear(self.dims[i - 1], self.dims[i]))
            self.layers.append(nn.BatchNorm1d(self.dims[i]))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(dropout[i - 1]))
        self.layers.append(nn.Linear(self.dims[-1], num_classes))
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
        """forward

        Args:
            inputs (Union[torch.Tensor, np.ndarray]): input tensor

        Returns:
            torch.Tensor: deepfm forward values
        """
        assert inputs.shape[1] == self.fea_size
        w = self.w.forward(inputs)
        v = self.v.forward(inputs)
        # FM part: first-order term plus the second-order interaction term,
        # 0.5 * ((sum of embeddings)^2 - sum of squared embeddings)
        square_of_sum = torch.pow(torch.sum(v, dim=1), 2)
        sum_of_square = torch.sum(v * v, dim=1)
        fm_out = (
            torch.sum((square_of_sum - sum_of_square) * 0.5, dim=1, keepdim=True)
            + torch.sum(w, dim=1)
            + self.w0
        )

        # deep part: MLP over the flattened embedding vectors
        dnn_out = torch.flatten(v, 1)
        for layer in self.layers:
            dnn_out = layer(dnn_out)
        out = fm_out + dnn_out
        out = self.sigmoid(out)
        return out
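
A hedged training-loop sketch for the model above, using random data and made-up sizes. Note that only the dense torch layers are handled by the torch optimizer; the sparse embedding tables are updated inside damo through the gradient hook registered in Embedding.forward.

import torch
import torch.nn as nn

# Hypothetical sizes and hyperparameters for illustration only.
model = DeepFM(emb_size=8, fea_size=16, dir="./embeddings")
criterion = nn.BCELoss()
# Only the dense torch parameters are optimized here; the sparse
# embeddings are updated by damo via apply_gradients.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
for step in range(10):
    ids = torch.randint(1, 100000, (32, 16), dtype=torch.int64)  # 0 is reserved for padding
    labels = torch.randint(0, 2, (32, 1)).float()
    preds = model(ids)
    loss = criterion(preds, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()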

Document

Doc Website



Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

damo-embedding-1.0.4.tar.gz (86.7 kB) - Source

Built Distributions

damo_embedding-1.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (59.5 MB) - CPython 3.10, manylinux: glibc 2.17+ x86-64
damo_embedding-1.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (58.9 MB) - CPython 3.10, manylinux: glibc 2.17+ ARM64
damo_embedding-1.0.4-cp310-cp310-macosx_11_0_arm64.whl (109.2 kB) - CPython 3.10, macOS 11.0+ ARM64
damo_embedding-1.0.4-cp310-cp310-macosx_10_9_x86_64.whl (3.8 MB) - CPython 3.10, macOS 10.9+ x86-64
damo_embedding-1.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (59.5 MB) - CPython 3.9, manylinux: glibc 2.17+ x86-64
damo_embedding-1.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (58.9 MB) - CPython 3.9, manylinux: glibc 2.17+ ARM64
damo_embedding-1.0.4-cp39-cp39-macosx_11_0_arm64.whl (109.2 kB) - CPython 3.9, macOS 11.0+ ARM64
damo_embedding-1.0.4-cp39-cp39-macosx_10_9_x86_64.whl (3.8 MB) - CPython 3.9, macOS 10.9+ x86-64
damo_embedding-1.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (59.5 MB) - CPython 3.8, manylinux: glibc 2.17+ x86-64
damo_embedding-1.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (58.9 MB) - CPython 3.8, manylinux: glibc 2.17+ ARM64
damo_embedding-1.0.4-cp38-cp38-macosx_11_0_arm64.whl (109.6 kB) - CPython 3.8, macOS 11.0+ ARM64
damo_embedding-1.0.4-cp38-cp38-macosx_10_9_x86_64.whl (3.8 MB) - CPython 3.8, macOS 10.9+ x86-64
damo_embedding-1.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (59.5 MB) - CPython 3.7m, manylinux: glibc 2.17+ x86-64
damo_embedding-1.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (58.9 MB) - CPython 3.7m, manylinux: glibc 2.17+ ARM64
damo_embedding-1.0.4-cp37-cp37m-macosx_10_9_x86_64.whl (3.8 MB) - CPython 3.7m, macOS 10.9+ x86-64
damo_embedding-1.0.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (59.5 MB) - CPython 3.6m, manylinux: glibc 2.17+ x86-64
damo_embedding-1.0.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (58.9 MB) - CPython 3.6m, manylinux: glibc 2.17+ ARM64
damo_embedding-1.0.4-cp36-cp36m-macosx_10_9_x86_64.whl (3.8 MB) - CPython 3.6m, macOS 10.9+ x86-64
