Hi. I predicted ATAC in a 128 kib window using each model fold. The predictions are in good concordance, except for the most important one, ALL_FOLDS, which shows a bend relative to all the other folds.
Is this expected? Is it a bug, or a feature of the distillation process (if ALL_FOLDS is indeed the distilled version)?
Steps to reproduce:
# Centre the prediction window on the position of interest.
tss = 178_807_423
ext = 131072 // 2  # half-width in bp; the full window spans 131,072 bp (128k window size)
window = Interval(chromosome="chr2", start=tss - ext, end=tss + ext)
# NOTE(review): UBERON IDs are normally written zero-padded (lung would be
# 'UBERON:0002048') — confirm that this unpadded term is what the API expects.
lung = 'UBERON:2048'

# Predict ATAC over the same window once per model version, keyed by version.
output = {}
for fold in dna_client.ModelVersion:
    print(f"Processing fold: {fold}")
    # The model version is chosen at client creation, so build one client per version.
    dna_model_public = dna_client.create(api_key, model_version=fold)
    output[fold] = dna_model_public.predict_interval(
        interval=window,
        requested_outputs=[dna_client.OutputType.ATAC],
        ontology_terms=[lung],
    )
# Create a dataframe with one column of flattened ATAC values per model version
atac_df = pd.DataFrame(
    {fold: result.atac.values.flatten() for fold, result in output.items()}
)

# Create pairwise scatterplots for all folds vs all folds
folds = list(output)
n_folds = len(folds)
# squeeze=False keeps `axes` two-dimensional even when there is a single
# model version, so the axes[i, j] lookup below always works.
fig, axes = plt.subplots(n_folds, n_folds, figsize=(15, 15), squeeze=False)

for i, fold_y in enumerate(folds):
    for j, fold_x in enumerate(folds):
        ax = axes[i, j]
        values_x = atac_df[fold_x]
        if i == j:
            # Diagonal: histogram on log-spaced bins. The lower bin edge must be
            # strictly positive, so it is taken from the smallest positive value.
            positive = values_x[values_x > 0]
            if not positive.empty:
                bins = np.logspace(
                    np.log10(positive.min()), np.log10(values_x.max()), 50
                )
                ax.hist(values_x, bins=bins, alpha=0.7)
            ax.set_xscale('log')
            ax.set_title(fold_x)
        else:
            # Off-diagonal: fold_x on the x axis against fold_y on the y axis.
            ax.scatter(values_x, atac_df[fold_y], alpha=0.3, s=1)
            ax.set_xscale('log')
            ax.set_yscale('log')
            # Add diagonal line over the range shared by both axes
            x_lo, x_hi = ax.get_xlim()
            y_lo, y_hi = ax.get_ylim()
            lims = [max(x_lo, y_lo), min(x_hi, y_hi)]
            ax.plot(lims, lims, 'r--', alpha=0.5)
        # Label only the outer edge of the grid to keep the panels uncluttered.
        if j == 0:
            ax.set_ylabel(fold_y)
        if i == n_folds - 1:
            ax.set_xlabel(fold_x)

plt.tight_layout()
plt.show()
