Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

retest: preparing to debug difference gke/ce #78

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions experiments/google/compute-engine/cpu/retest/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Retest of Compute Engine

> Size 32

We are trying to diagnose why results differ between Compute Engine and GKE. So far we have identified three variables:

- MTU: set to 8896 on GKE, whereas Compute Engine used the underlying default of 1460
- Network tier: Tier 1 "PREMIUM" was set for GKE but not for Compute Engine
- COMPACT placement: we were never able to get it on Compute Engine, at least not for more than 10 instances. I can try again, but I am not sure the result will be different.
44 changes: 44 additions & 0 deletions experiments/google/compute-engine/cpu/retest/base/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Instantiate the compute module once per entry of var.compute_node_specs.
# Instances are keyed by name_prefix, so every spec needs a distinct prefix.
module "compute_nodes" {
  source = "./modules/compute"

  # The list index is not needed; iterate over the specs directly.
  for_each = {
    for node in var.compute_node_specs :
    node.name_prefix => node
  }

  project_id = var.project_id
  region     = var.region
  family     = var.family
  subnetwork = var.subnetwork

  name_prefix   = each.value.name_prefix
  machine_arch  = each.value.machine_arch
  machine_type  = each.value.machine_type
  num_instances = each.value.instances

  boot_script = lookup(each.value, "boot_script", null)

  # lookup() only uses its default when the key is missing; an attribute that
  # is present but null is returned as null. coalesce() turns that null into a
  # usable default so the child module never receives a null bool.
  compact_placement = coalesce(lookup(each.value, "compact", false), false)

  # No GPU block unless a type is named and the count is positive. The count is
  # coalesced to 0 first because comparing null with `<=` is an error.
  gpu = lookup(each.value, "gpu_type", null) == null || coalesce(lookup(each.value, "gpu_count", 0), 0) <= 0 ? null : {
    type  = each.value.gpu_type
    count = each.value.gpu_count
  }

  service_account = {
    email  = var.service_account_emails["compute"]
    scopes = var.compute_scopes
  }

  nfs_mounts = var.cluster_storage
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Resolve the boot image for the compute nodes from image family var.family
# in project var.project_id.
data "google_compute_image" "fluxfw_compute_x86_64_image" {
  family  = var.family
  project = var.project_id
}

# Zones of var.region in var.project_id; the first returned name is used as
# the zone for the compute instances.
data "google_compute_zones" "available" {
  region  = var.region
  project = var.project_id
}

# Reserved external address whose IP is used as nat_ip in local.access_config.
# NOTE(review): the same address feeds the instance template used for all
# var.num_instances instances — confirm this behaves as intended when
# num_instances > 1.
resource "google_compute_address" "ip_address" {
  # Prefix the name so that several instantiations of this module (one per
  # node spec) do not all try to create an address called "external-ip".
  name = "${var.name_prefix}-external-ip"

  # Create the address in the same project and region as the instances that
  # use it, rather than relying on provider-level defaults.
  project = var.project_id
  region  = var.region

  # Keep the address in the same tier as the access_config that references it.
  # If left unset the address is PREMIUM while var.network_tier defaults to
  # "STANDARD".
  network_tier = var.network_tier
}

# Values derived from the module inputs and data sources.
locals {
  # Boot image per architecture key; "x86-64" is the only key defined.
  compute_images = {
    "x86-64" = {
      image   = data.google_compute_image.fluxfw_compute_x86_64_image.self_link
      project = data.google_compute_image.fluxfw_compute_x86_64_image.project
    }
  }

  # Compact placement overrides the scheduling inputs: automatic restart is
  # switched off and host maintenance is forced to "TERMINATE".
  automatic_restart   = var.compact_placement ? false : var.automatic_restart
  on_host_maintenance = var.compact_placement ? "TERMINATE" : var.on_host_maintenance

  # External access configuration handed to the instance template.
  access_config = {
    nat_ip       = google_compute_address.ip_address.address
    network_tier = var.network_tier
  }
}

# Group placement policy, created only when var.compact_placement is true
# (count is 0 otherwise).
resource "google_compute_resource_policy" "collocated" {
  count = var.compact_placement ? 1 : 0

  project = var.project_id
  region  = var.region
  name    = "${var.name_prefix}-collocated-policy"

  group_placement_policy {
    collocation = "COLLOCATED"
    vm_count    = var.num_instances
  }
}

# Instance template from which the compute node instances are created.
module "flux_compute_instance_template" {
  source = "github.com/terraform-google-modules/terraform-google-vm/modules/instance_template"

  project_id  = var.project_id
  region      = var.region
  name_prefix = var.name_prefix
  subnetwork  = var.subnetwork
  tags        = ["ssh", "flux", "compute"]

  machine_type    = var.machine_type
  disk_size_gb    = 256
  gpu             = var.gpu
  service_account = var.service_account
  access_config   = [local.access_config]

  # Index the image map with the variable directly; wrapping a lone reference
  # in "${...}" is a deprecated interpolation-only expression.
  source_image         = local.compute_images[var.machine_arch].image
  source_image_project = local.compute_images[var.machine_arch].project

  # Both values are taken from locals, which override the inputs when compact
  # placement is enabled.
  automatic_restart   = local.automatic_restart
  on_host_maintenance = local.on_host_maintenance

  startup_script = var.boot_script

  metadata = {
    "enable-oslogin" : "TRUE",
    "VmDnsSetting" : "GlobalDefault",
    # The NFS mount map is stored as a JSON string.
    "nfs-mounts" : jsonencode(var.nfs_mounts),
    "gpus-attached" : var.gpu != null ? "TRUE" : "FALSE"
  }
}

# Create var.num_instances instances from the compute instance template.
module "flux_compute_instances" {
  source = "github.com/terraform-google-modules/terraform-google-vm/modules/compute_instance"

  instance_template = module.flux_compute_instance_template.self_link
  num_instances     = var.num_instances

  hostname            = var.name_prefix
  add_hostname_suffix = true

  region     = var.region
  subnetwork = var.subnetwork
  # All instances go into the first zone returned for the region.
  zone = data.google_compute_zones.available.names[0]

  # Attach the placement policy only when it was created.
  resource_policies = var.compact_placement ? [google_compute_resource_policy.collocated[0].self_link] : []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Automatic-restart setting; the module's locals replace it with false when
# compact_placement is true.
variable "automatic_restart" {
  description = "(Optional) Specifies whether the instance should be automatically restarted if it is terminated by Compute Engine (not terminated by a user)."
  default     = true
  type        = bool
}

# Passed to the instance template as startup_script; null by default.
variable "boot_script" {
  type        = string
  default     = null
  description = "(Optional) the name of a file containing a script to be executed on compute nodes at boot time"
}

# Toggles creation and attachment of the collocated placement policy, and
# overrides automatic_restart / on_host_maintenance in the module's locals.
variable "compact_placement" {
  type        = bool
  default     = false
  description = "(Optional) a boolean which determines whether a set of compute nodes has a compact placement resource policy attached to them."
}

# Image family given to the google_compute_image data source.
variable "family" {
  type        = string
  default     = "flux-framework"
  description = "The source X86 image family prefix to use"
}

# GPU specification forwarded to the instance template. null (the default)
# also makes the "gpus-attached" metadata value "FALSE".
variable "gpu" {
  description = "The type and count of GPU(s) to attach to a compute node"
  default     = null
  type = object({
    type  = string
    count = number
  })
}

# Architecture key used to index local.compute_images when selecting the boot
# image. The description names the exact key, because the only entry in that
# map is spelled "x86-64" (hyphen), not "x86_64".
variable "machine_arch" {
  description = "The instruction set architecture used by the compute node; must be a key of local.compute_images (currently only \"x86-64\")"
  type        = string
}

# Network tier placed in the access_config given to the instance template.
variable "network_tier" {
  type        = string
  default     = "STANDARD"
  description = "The network tier (STANDARD or PREMIUM)"
}

# Required; forwarded unchanged to the instance template.
variable "machine_type" {
  type        = string
  description = "The Compute Engine machine type to be used for the compute node"
}

# Required; used as the template name prefix, the instance hostname, and the
# stem of the collocated policy name.
variable "name_prefix" {
  type        = string
  description = "The name prefix for the compute node instances, the full instances names will be this prefix followed by a node number"
}

# JSON-encoded into the "nfs-mounts" instance metadata entry.
variable "nfs_mounts" {
  type        = map(string)
  default     = {}
  description = "A map with keys 'share' and 'mountpoint' describing an NFS export and its intended mount point"
}

# Instance count; also used as vm_count of the collocated placement policy.
variable "num_instances" {
  type        = number
  default     = 1
  description = "The number of compute node instances to create"
}

# Host-maintenance behaviour; the module's locals replace it with "TERMINATE"
# when compact_placement is true.
variable "on_host_maintenance" {
  description = "Instance availability Policy"
  default     = "MIGRATE"
  type        = string
}

# Required; used for the image lookup, zone lookup, placement policy and
# instance template.
variable "project_id" {
  type        = string
  description = "The GCP project ID"
}

# Required; used for the zone lookup, placement policy, instance template and
# instances.
variable "region" {
  type        = string
  description = "The GCP region where the cluster resides"
}

# Required; forwarded unchanged to the instance template.
variable "service_account" {
  type = object({
    email  = string
    scopes = set(string)
  })
  description = "The GCP service account used by the compute node"
}

# Required; given to both the instance template and the instances.
variable "subnetwork" {
  type        = string
  description = "Subnetwork to deploy to"
}
53 changes: 53 additions & 0 deletions experiments/google/compute-engine/cpu/retest/base/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

# Required; passed to the compute module as nfs_mounts.
variable "cluster_storage" {
  type        = map(string)
  description = "A map with keys 'share' and 'mountpoint' describing an NFS export and its intended mount point"
}

# Image family passed through to the compute module.
# NOTE(review): the default here is "global", while the compute module's own
# `family` default is "flux-framework" — confirm "global" is an intended image
# family and not a copy/paste leftover.
variable "family" {
  type        = string
  default     = "global"
  description = "The source image x86 prefix to be used by the compute node(s)"
}

# One object per group of compute nodes; each becomes one instantiation of the
# compute module, keyed by name_prefix. Defaults to an empty list.
# NOTE(review): `properties` is declared but not read by the visible module
# call — confirm whether it is still needed.
variable "compute_node_specs" {
  description = "A list of compute node specifications"
  default     = []
  type = list(object({
    name_prefix  = string
    machine_arch = string
    machine_type = string
    gpu_type     = string
    gpu_count    = number
    compact      = bool
    instances    = number
    properties   = set(string)
    boot_script  = string
  }))
}

# Used as the `scopes` of the service account given to the compute module.
variable "compute_scopes" {
  description = "The set of access scopes for compute node instances"
  type        = set(string)
  default     = ["cloud-platform"]
}

# Required; passed through to the compute module.
variable "project_id" {
  type        = string
  description = "The GCP project ID"
}

# Required; passed through to the compute module.
variable "region" {
  type        = string
  description = "The GCP region where the cluster resides"
}

# Required; the visible module call reads only the "compute" key.
variable "service_account_emails" {
  type        = map(string)
  description = "A map with keys: 'compute', 'login', 'manager' that map to the service account to be used by the respective nodes"
}

# Required; passed through to the compute module.
variable "subnetwork" {
  type        = string
  description = "Subnetwork to deploy to"
}
Loading