Skip to content

Commit

Permalink
Input normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
SONGDONGYUAN1994 committed Aug 13, 2023
1 parent f2a2b81 commit ff589b0
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
19 changes: 5 additions & 14 deletions scsampler/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
#!/usr/bin/env python
# coding: utf-8

# In[11]:


import numpy as np
import pandas as pd
import scanpy as sc
Expand All @@ -12,11 +9,9 @@
from numbers import Number
import warnings
from typing import Union, Optional, Tuple, Collection, Sequence, Iterable
import scipy
from scipy.spatial import distance
from scipy.sparse import issparse, isspmatrix_csr, csr_matrix, spmatrix
from uclab import uclab, uclab_split
import pyarrow as pa
from pyarrow import ChunkedArray
from sklearn.decomposition import TruncatedSVD

Expand All @@ -25,6 +20,7 @@ def scsampler(
fraction: Optional[float] = None,
n_obs: Optional[int] = None,
random_state: int = 0,
alpha: int = 50,
copy: bool = False,
obsm: Optional[str] = 'X_pca',
dr_num: Optional[int]=None,
Expand All @@ -44,6 +40,8 @@ def scsampler(
Subsample to this number of observations.
random_state
Random seed to change subsampling.
alpha
The exponent applied to the inverse distance in the calculation. A larger alpha leads to better diversity, but a value that is too large will cause numerical overflow.
copy
If an :class:`~anndata.AnnData` is passed,
determines whether a copy is returned.
Expand Down Expand Up @@ -112,7 +110,7 @@ def scsampler(

## Run uclab
split = 1 if random_split is None else random_split
obs_indices = uclab_split(X, new_n_obs, alpha = 4 * old_n_vars, drop_start=1, drop_rate=0, split=split)
obs_indices = uclab_split(X, new_n_obs, alpha = alpha, drop_start=1, drop_rate=0, split=split)

if isinstance(data, AnnData):
if copy:
Expand All @@ -128,11 +126,4 @@ def scsampler(
if copy:
return X[obs_indices], obs_indices
else:
return obs_indices


# In[ ]:




return obs_indices
14 changes: 13 additions & 1 deletion scsampler/uclab.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import numpy as np
from scipy.spatial import distance
from scipy.sparse import issparse, isspmatrix_csr, csr_matrix, spmatrix

from miniball import get_bounding_ball
from math import sqrt
import sys

def uclab(X, n, alpha, drop_start=1, drop_rate=0):
N = X.shape[0]
sample_index = np.full(n, -1, dtype=int)
Expand All @@ -10,6 +13,11 @@ def uclab(X, n, alpha, drop_start=1, drop_rate=0):
# Define function
#int64 = np.int64

# Normalize the input matrix by the radius of the bounding sphere of a small random subset to avoid overflow in the distance calculation.
radius_index = np.random.randint(N, size=100)
Center_pos, r2 = get_bounding_ball(X[radius_index,:])
r = sqrt(r2)
X = X/r
# step 0
initial_index = np.random.randint(N, size=1)
sample_index[0] = initial_index
Expand All @@ -18,6 +26,10 @@ def uclab(X, n, alpha, drop_start=1, drop_rate=0):
d_index = np.argwhere(distances != -1).flatten()
distances[d_index] = np.power(1/distances[d_index], alpha)

if np.count_nonzero(distances[d_index]) < N-1:
sys.exit("The alpha is too large and distance calculation gets an overflow. Please decrease alpha or normalize your data.")


for i in np.arange(1,n):
#print(i, end=' ')
# drop large points
Expand Down

0 comments on commit ff589b0

Please sign in to comment.