Tutorial 21: Data Sources and External Data
Learning Objectives
By the end of this tutorial, you will be able to:
- Use data sources to query existing infrastructure and external APIs
- Implement external data sources for custom integrations
- Handle data transformation and filtering with data sources
- Design dynamic configurations based on external data
- Apply security best practices when working with external data
Prerequisites
- Completed Tutorial 7: Query Data Sources
- Understanding of Terraform providers and resources
- Basic knowledge of APIs and data formats (JSON, YAML)
Introduction
Data sources allow Terraform to fetch information from external systems, existing infrastructure, and APIs. They enable dynamic configuration generation based on real-time data, making your infrastructure code more flexible and responsive to changing conditions.
Common Data Sources
AWS Data Sources
# Query existing VPC
data "aws_vpc" "existing" {
filter {
name = "tag:Name"
values = ["production-vpc"]
}
}
# Get latest AMI
# NOTE: most_recent = true re-resolves on every plan, so a newly published
# matching AMI changes this data source's id and can trigger replacement of
# any instance that references it.
data "aws_ami" "ubuntu" {
most_recent = true
owners = ["099720109477"] # Canonical
filter {
name = "name"
values = ["ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*"]
}
filter {
name = "virtualization-type"
values = ["hvm"]
}
}
# Query availability zones
data "aws_availability_zones" "available" {
state = "available"
filter {
name = "zone-type"
values = ["availability-zone"]
}
}
# Get current region and caller identity
data "aws_region" "current" {}
data "aws_caller_identity" "current" {}
# Query existing subnets
# Referencing data.aws_vpc.existing.id creates an implicit dependency, so
# the VPC lookup always runs first.
data "aws_subnets" "private" {
filter {
name = "vpc-id"
values = [data.aws_vpc.existing.id]
}
filter {
name = "tag:Type"
values = ["private"]
}
}
# Look up the full attribute set for each private subnet id found above.
data "aws_subnet" "private_details" {
for_each = toset(data.aws_subnets.private.ids)
id = each.value
}
# Query Route 53 hosted zone
data "aws_route53_zone" "main" {
name = var.domain_name
private_zone = false
}
# Get existing security groups
data "aws_security_groups" "database" {
filter {
name = "group-name"
values = ["*database*"]
}
filter {
name = "vpc-id"
values = [data.aws_vpc.existing.id]
}
}
# Using data sources in resources
resource "aws_instance" "app" {
  count         = var.instance_count
  ami           = data.aws_ami.ubuntu.id
  instance_type = var.instance_type

  # Round-robin instances across the discovered private subnets. The chosen
  # subnet already pins the availability zone, so no availability_zone
  # argument is set: supplying both (as independent modulo lookups over the
  # subnet list and the AZ list) can disagree whenever the two lists differ
  # in length or order, which makes the apply fail.
  subnet_id = data.aws_subnets.private.ids[count.index % length(data.aws_subnets.private.ids)]

  # NOTE(review): these are the *database* security groups discovered above —
  # verify that attaching them to app instances is intended.
  vpc_security_group_ids = data.aws_security_groups.database.ids

  tags = {
    Name   = "${var.project_name}-app-${count.index + 1}"
    Region = data.aws_region.current.name
    Owner  = data.aws_caller_identity.current.user_id
  }
}
Multi-Provider Data Sources
# Kubernetes data sources
data "kubernetes_namespace" "monitoring" {
metadata {
name = "monitoring"
}
}
# Read the Prometheus service from the namespace looked up above.
data "kubernetes_service" "prometheus" {
metadata {
name = "prometheus-server"
namespace = data.kubernetes_namespace.monitoring.metadata[0].name
}
}
# GitHub data sources
data "github_repository" "app" {
full_name = "${var.github_org}/${var.repository_name}"
}
data "github_team" "developers" {
slug = "developers"
}
# Azure data sources
data "azurerm_client_config" "current" {}
data "azurerm_resource_group" "existing" {
name = var.resource_group_name
}
data "azurerm_virtual_network" "existing" {
name = var.vnet_name
resource_group_name = data.azurerm_resource_group.existing.name
}
# Google Cloud data sources
data "google_project" "current" {}
# Zones currently serving traffic ("UP") in the chosen region.
data "google_compute_zones" "available" {
region = var.region
status = "UP"
}
# Available GKE versions for the region.
data "google_container_engine_versions" "gke" {
location = var.region
}
External Data Sources
HTTP Data Source
# Query external API for configuration
data "http" "service_discovery" {
  url = "https://api.example.com/v1/services"

  request_headers = {
    Accept        = "application/json"
    Authorization = "Bearer ${var.api_token}"
    # Keys containing hyphens are not valid bare HCL identifiers and must be
    # quoted — the unquoted form is a parse error.
    "User-Agent" = "Terraform/${var.terraform_version}"
  }

  # Retry transient failures (supported by the hashicorp/http provider v3+).
  retry {
    attempts     = 3
    max_delay_ms = 1000
  }
}
# Parse and use the API response
locals {
# Decode the raw JSON body returned by the service-discovery API.
services = jsondecode(data.http.service_discovery.response_body)
# Transform external data for Terraform use.
# NOTE(review): assumes the API returns
# {items: [{name, port, protocol, health_check: {path}, scaling:
# {default_replicas}}, ...]} — confirm against the API contract; any
# missing key here fails the plan.
service_configs = {
for service in local.services.items :
service.name => {
port = service.port
protocol = service.protocol
health_path = service.health_check.path
replicas = service.scaling.default_replicas
}
}
}
# Create load balancer target groups based on external service data
resource "aws_lb_target_group" "services" {
# One target group per service discovered via the external API.
for_each = local.service_configs
# NOTE(review): ELB target group names are limited to 32 characters — long
# project/service names may exceed this.
name = "${var.project_name}-${each.key}-tg"
port = each.value.port
protocol = upper(each.value.protocol)
vpc_id = data.aws_vpc.existing.id
health_check {
enabled = true
healthy_threshold = 2
unhealthy_threshold = 2
timeout = 5
interval = 30
path = each.value.health_path
matcher = "200"
}
tags = {
Name = "${var.project_name}-${each.key}"
Service = each.key
Source = "external-api"
}
}
# Query configuration from remote YAML/JSON files.
# NOTE: named "app_config_yaml" rather than "app_config" because another
# data "http" "app_config" block is declared later in this tutorial, and
# duplicate data source addresses in one configuration are an error.
data "http" "app_config_yaml" {
  url = "https://raw.githubusercontent.com/${var.github_org}/${var.config_repo}/main/config/${var.environment}.yaml"

  request_headers = {
    Accept = "application/yaml"
  }
}

locals {
  # Parse the YAML configuration fetched above.
  app_configuration = yamldecode(data.http.app_config_yaml.response_body)
}
External Data Source with Scripts
# Custom script to fetch dynamic data
data "external" "instance_metadata" {
program = ["python3", "${path.module}/scripts/get_instance_data.py"]
# The query map is passed to the program as a JSON object on stdin; the
# external protocol requires all values to be strings.
# NOTE(review): filter_tag is passed but the example script below never
# reads it — confirm whether it should filter on this tag.
query = {
region = var.aws_region
environment = var.environment
filter_tag = var.instance_filter_tag
}
}
# Python script example: scripts/get_instance_data.py
# #!/usr/bin/env python3
# import json
# import sys
# import boto3
#
# def main():
# # Read query from stdin
# query = json.load(sys.stdin)
#
# # Initialize AWS client
# ec2 = boto3.client('ec2', region_name=query['region'])
#
# # Query instances
# response = ec2.describe_instances(
# Filters=[
# {'Name': 'tag:Environment', 'Values': [query['environment']]},
# {'Name': 'instance-state-name', 'Values': ['running']}
# ]
# )
#
# instances = []
# for reservation in response['Reservations']:
# for instance in reservation['Instances']:
# instances.append({
# 'id': instance['InstanceId'],
# 'type': instance['InstanceType'],
# 'az': instance['Placement']['AvailabilityZone'],
# 'private_ip': instance['PrivateIpAddress']
# })
#
# # Return JSON result. The external protocol only allows a flat
# # string-to-string object, so the list is serialized with json.dumps
# # and the count converted to str.
# result = {
# 'instances': json.dumps(instances),
# 'count': str(len(instances))
# }
#
# print(json.dumps(result))
#
# if __name__ == '__main__':
# main()
locals {
# Parse external script results: decode the JSON-string list and convert
# the stringified count back to a number.
existing_instances = jsondecode(data.external.instance_metadata.result.instances)
instance_count = tonumber(data.external.instance_metadata.result.count)
# Create mapping of existing instances by AZ. The "..." grouping operator
# collects every instance sharing an AZ into a list, so each value is a
# list of instance objects keyed by AZ name.
instances_by_az = {
for instance in local.existing_instances :
instance.az => instance...
}
}
# Database discovery script
data "external" "database_endpoints" {
program = ["bash", "${path.module}/scripts/discover_databases.sh"]
# Passed to the script as a JSON object on stdin; values must be strings.
query = {
environment = var.environment
vpc_id = data.aws_vpc.existing.id
}
}
# Bash script example: scripts/discover_databases.sh
# #!/bin/bash
# set -e
#
# # Parse input JSON
# eval "$(jq -r '@sh "ENVIRONMENT=\(.environment) VPC_ID=\(.vpc_id)"')"
#
# # Query RDS instances
# INSTANCES=$(aws rds describe-db-instances \
# --query "DBInstances[?DBSubnetGroup.VpcId=='$VPC_ID' && contains(TagList[?Key=='Environment'].Value, '$ENVIRONMENT')].[DBInstanceIdentifier,Endpoint.Address,Endpoint.Port,Engine]" \
# --output json)
#
# # Format result. The external data source protocol requires a flat
# # string-to-string JSON object on stdout, hence the 'tostring' calls.
# jq -n \
# --argjson instances "$INSTANCES" \
# '{
# databases: ($instances | map({
# identifier: .[0],
# endpoint: .[1],
# port: .[2],
# engine: .[3]
# }) | tostring),
# count: ($instances | length | tostring)
# }'
locals {
# Decode the JSON-string 'databases' value back into a list of objects.
database_info = jsondecode(data.external.database_endpoints.result.databases)
}
Template Data Source
# The hashicorp/template provider (data "template_file") is deprecated and
# archived; the built-in templatefile() function replaces it with no extra
# provider dependency. Rendered output is exposed via locals so the
# consuming resources below read local.* instead of data.template_file.*.
locals {
  # Cloud-init user data rendered from a template file.
  user_data_rendered = templatefile("${path.module}/templates/user_data.yaml", {
    hostname     = "${var.project_name}-${var.environment}"
    app_version  = var.app_version
    database_url = local.database_info[0].endpoint
    api_key      = var.api_key
    environment  = var.environment
    log_level    = var.environment == "prod" ? "INFO" : "DEBUG"
    cluster_name = aws_ecs_cluster.main.name
  })

  # Kubernetes Deployment manifest rendered from a template file.
  deployment_manifest_rendered = templatefile("${path.module}/templates/deployment.yaml", {
    app_name        = var.application_name
    image_tag       = var.image_tag
    replica_count   = var.environment == "prod" ? 3 : 1
    cpu_request     = var.cpu_request
    memory_request  = var.memory_request
    cpu_limit       = var.cpu_limit
    memory_limit    = var.memory_limit
    namespace       = var.namespace
    config_map_name = kubernetes_config_map.app.metadata[0].name
    secret_name     = kubernetes_secret.app.metadata[0].name
  })
}

# Use templated data
resource "aws_instance" "app" {
  count         = var.instance_count
  ami           = data.aws_ami.ubuntu.id
  instance_type = var.instance_type
  user_data     = local.user_data_rendered

  tags = {
    Name = "${var.project_name}-app-${count.index + 1}"
  }
}

resource "kubernetes_manifest" "app_deployment" {
  manifest = yamldecode(local.deployment_manifest_rendered)
}
Advanced Data Source Patterns
Dynamic Multi-Environment Data Fetching
# variables.tf
variable "environments" {
description = "List of environments to query"
type = list(string)
default = ["dev", "staging", "prod"]
}
variable "service_discovery_endpoints" {
description = "Service discovery endpoints per environment"
type = map(string)
default = {
dev = "https://dev-api.example.com/services"
staging = "https://staging-api.example.com/services"
prod = "https://api.example.com/services"
}
}
# Query multiple environments
data "http" "environment_services" {
# One HTTP request per environment name.
for_each = toset(var.environments)
url = var.service_discovery_endpoints[each.key]
request_headers = {
Accept = "application/json"
# NOTE(review): var.api_tokens (per-environment token map) is not
# declared in this snippet — it must be defined elsewhere.
Authorization = "Bearer ${var.api_tokens[each.key]}"
Environment = each.key
}
}
locals {
# Parse all environment data: one decoded response per environment.
all_environment_services = {
for env in var.environments :
env => jsondecode(data.http.environment_services[env].response_body)
}
# Aggregate services across environments — deduplicated list of service
# names seen in any environment.
unique_services = distinct(flatten([
for env, services in local.all_environment_services :
[for service in services.items : service.name]
]))
# Create environment-service matrix: one element per (service, env) pair;
# 'key' is a unique identifier used for for_each downstream.
service_environments = flatten([
for env, services in local.all_environment_services : [
for service in services.items : {
environment = env
service = service.name
config = service
key = "${service.name}-${env}"
}
]
])
}
# Create resources based on aggregated data
resource "aws_lb_target_group" "cross_environment" {
# Re-key the (service, environment) list by its unique key for for_each.
for_each = {
for item in local.service_environments :
item.key => item
}
# NOTE(review): ELB target group names are limited to 32 characters; long
# service/environment names may exceed this.
name = "${each.value.service}-${each.value.environment}"
port = each.value.config.port
protocol = "HTTP"
vpc_id = data.aws_vpc.existing.id
tags = {
Service = each.value.service
Environment = each.value.environment
Source = "external-discovery"
}
}
Data Source Chaining and Dependencies
# First, get VPC information
data "aws_vpc" "main" {
filter {
name = "tag:Environment"
values = [var.environment]
}
filter {
name = "tag:Project"
values = [var.project_name]
}
}
# Then get subnets based on VPC. Referencing data.aws_vpc.main.id creates
# an implicit dependency, so the VPC lookup always runs first.
data "aws_subnets" "app_subnets" {
filter {
name = "vpc-id"
values = [data.aws_vpc.main.id]
}
filter {
name = "tag:Tier"
values = ["application"]
}
}
# Get security groups based on VPC
data "aws_security_groups" "app_sgs" {
filter {
name = "vpc-id"
values = [data.aws_vpc.main.id]
}
filter {
name = "tag:Application"
values = [var.application_name]
}
}
# Query external service for configuration based on infrastructure
data "http" "app_config" {
  url = "https://config-api.example.com/config"

  request_headers = {
    Accept = "application/json"
    # Header names containing hyphens must be quoted — they are not valid
    # bare HCL identifiers.
    "VPC-ID" = data.aws_vpc.main.id
    Subnets  = join(",", data.aws_subnets.app_subnets.ids)
    Region   = data.aws_region.current.name
  }

  # No explicit depends_on is needed: referencing data.aws_vpc.main above
  # already orders this read after the VPC lookup. An explicit depends_on
  # on a data source would also defer the read to apply time.
}
locals {
  # Parse configuration with fallbacks
  external_config = jsondecode(data.http.app_config.response_body)

  # Merge external config with local defaults; external values win.
  app_config = merge({
    instance_type = "t3.micro"
    min_instances = 1
    max_instances = 3
  }, local.external_config.infrastructure)

  # Select subnets based on external recommendations. The conditional is
  # wrapped in parentheses because HCL does not allow a bare conditional
  # expression to be split across multiple lines.
  selected_subnets = (
    length(local.external_config.recommended_subnets) > 0 ?
    local.external_config.recommended_subnets :
    data.aws_subnets.app_subnets.ids
  )
}
# Use chained data sources in resources
resource "aws_launch_template" "app" {
name_prefix = "${var.application_name}-"
image_id = data.aws_ami.ubuntu.id
# Instance type comes from the merged external/default config.
instance_type = local.app_config.instance_type
vpc_security_group_ids = data.aws_security_groups.app_sgs.ids
# Render user data from a template with both the merged app config and the
# discovered VPC details; launch template user_data must be base64-encoded.
user_data = base64encode(templatefile("${path.module}/user_data.sh", {
app_config = local.app_config
vpc_config = {
vpc_id = data.aws_vpc.main.id
vpc_cidr = data.aws_vpc.main.cidr_block
subnet_ids = local.selected_subnets
}
}))
}
Conditional Data Source Usage
# variables.tf
variable "use_existing_vpc" {
description = "Use existing VPC instead of creating new one"
type = bool
default = false
}
variable "existing_vpc_name" {
description = "Name of existing VPC to use"
type = string
default = ""
}
variable "use_external_config" {
description = "Fetch configuration from external API"
type = bool
default = false
}
# Conditional data sources.
# count = 0/1 toggles whether the lookup runs at all; consumers must index
# the result with [0].
data "aws_vpc" "existing" {
count = var.use_existing_vpc ? 1 : 0
filter {
name = "tag:Name"
values = [var.existing_vpc_name]
}
}
data "http" "external_config" {
count = var.use_external_config ? 1 : 0
url = "https://config-api.example.com/infrastructure/${var.environment}"
request_headers = {
Accept = "application/json"
Authorization = "Bearer ${var.config_api_token}"
}
}
# Local values with conditional logic
locals {
  # Use existing VPC or create new one; exactly one of the two exists
  # because their count conditions are inverses of each other.
  vpc_id = var.use_existing_vpc ? data.aws_vpc.existing[0].id : aws_vpc.main[0].id

  # Decode the external response once instead of once per field; null when
  # the external config data source was not created.
  external_instance_config = var.use_external_config ? jsondecode(data.http.external_config[0].response_body) : null

  # Use external config or local defaults.
  instance_config = var.use_external_config ? {
    type  = local.external_instance_config.instance_type
    count = local.external_instance_config.instance_count
  } : {
    type  = var.instance_type
    count = var.instance_count
  }
}
# Resources that may or may not be created
resource "aws_vpc" "main" {
# Only created when not reusing an existing VPC (the inverse of the
# data "aws_vpc" "existing" condition).
count = var.use_existing_vpc ? 0 : 1
cidr_block = var.vpc_cidr
enable_dns_hostnames = true
enable_dns_support = true
tags = {
Name = "${var.project_name}-vpc"
}
}
# Resources using conditional data
resource "aws_instance" "app" {
count = local.instance_config.count
ami = data.aws_ami.ubuntu.id
instance_type = local.instance_config.type
# NOTE(review): aws_subnet.app is not declared in this snippet — it must
# be defined elsewhere in the configuration.
subnet_id = aws_subnet.app[count.index % length(aws_subnet.app)].id
tags = {
Name = "${var.project_name}-app-${count.index + 1}"
ConfigSource = var.use_external_config ? "external" : "local"
}
}
Security and Best Practices
Secure API Access
# variables.tf
variable "api_credentials" {
description = "API credentials for external services"
type = object({
token = string
secret = string
endpoint = string
})
# Redacts the value from plan/apply output; note it is still stored in
# plaintext in the state file.
sensitive = true
}
# Secure HTTP data source
data "http" "secure_config" {
  url = var.api_credentials.endpoint

  request_headers = {
    Accept        = "application/json"
    Authorization = "Bearer ${var.api_credentials.token}"
    # Hyphenated header names must be quoted string keys — bare identifiers
    # with hyphens are an HCL parse error.
    "X-API-Secret" = var.api_credentials.secret
    "User-Agent"   = "Terraform-${random_id.session.hex}"
  }

  # Add retry logic for reliability
  retry {
    attempts     = 3
    min_delay_ms = 1000
    max_delay_ms = 5000
  }
}

resource "random_id" "session" {
  byte_length = 8
}
# Validate external data
locals {
external_data = jsondecode(data.http.secure_config.response_body)
# Validate required fields exist
validated_config = {
# Fall back to a safe default when the key is absent.
instance_type = lookup(local.external_data, "instance_type", "t3.micro")
# Clamp the externally supplied count into the range [1, 10].
instance_count = max(1, min(10, lookup(local.external_data, "instance_count", 1)))
# Keep only syntactically valid CIDR blocks: can(cidrhost(...)) is false
# for malformed entries, which are silently dropped.
allowed_cidrs = [
for cidr in lookup(local.external_data, "allowed_cidrs", ["10.0.0.0/8"]) :
cidr if can(cidrhost(cidr, 0))
]
}
}
Error Handling and Fallbacks
# External data with error handling
data "external" "service_discovery" {
program = ["bash", "${path.module}/scripts/discover_services.sh"]
# Passed to the script as a JSON object on stdin; values must be strings.
query = {
environment = var.environment
region = var.aws_region
timeout = "30"
}
}
# Script with error handling: scripts/discover_services.sh
# #!/bin/bash
# set -e
#
# # Parse input
# eval "$(jq -r '@sh "ENV=\(.environment) REGION=\(.region) TIMEOUT=\(.timeout)"')"
#
# # Emit a fallback result. NOTE: the external data source treats any
# # non-zero exit status as a hard failure, so for the locals below to be
# # able to fall back (they check the "fallback" key) this must print
# # valid JSON on STDOUT and exit 0 — not write to stderr and exit 1.
# error_exit() {
# echo '{"services": "[]", "error": "'"$1"'", "fallback": "true"}'
# exit 0
# }
#
# # Try to discover services with timeout
# timeout $TIMEOUT aws elbv2 describe-load-balancers \
# --region $REGION \
# --query 'LoadBalancers[?contains(Tags[?Key==`Environment`].Value, `'$ENV'`)].LoadBalancerName' \
# --output json > /tmp/services.json 2>/dev/null || error_exit "Failed to discover services"
#
# # Return results as a flat string-to-string JSON object, as the external
# # protocol requires.
# SERVICES=$(cat /tmp/services.json)
# jq -n --argjson services "$SERVICES" '{
# services: ($services | tostring),
# count: ($services | length | tostring),
# fallback: "false"
# }'
locals {
  # Handle external data with fallbacks
  discovery_result = data.external.service_discovery.result

  # Fixes two errors in the original: a local must be referenced as
  # local.discovery_result (a bare name is undefined), and a conditional
  # expression split across lines must be parenthesized.
  services = (
    lookup(local.discovery_result, "fallback", "false") == "true" ?
    [] :
    jsondecode(local.discovery_result.services)
  )

  # Provide default configuration if external discovery fails
  service_config = length(local.services) > 0 ? {
    load_balancers = local.services
    source         = "discovered"
  } : {
    load_balancers = [var.default_load_balancer]
    source         = "default"
  }
}
# HTTP data source with validation
data "http" "config_with_validation" {
url = var.config_api_url
request_headers = {
Accept = "application/json"
}
# Custom condition checks require Terraform >= 1.2; 'self' refers to this
# data source's attributes after the read completes.
lifecycle {
postcondition {
condition = contains([200, 201], self.status_code)
error_message = "API returned status ${self.status_code}"
}
}
}
locals {
  # Validate and parse external configuration; fall back to an empty map
  # when the body is not valid JSON. Each multi-line conditional is wrapped
  # in parentheses — HCL does not allow a bare `? … : …` expression to span
  # lines, so the original forms were parse errors.
  raw_config = (
    can(jsondecode(data.http.config_with_validation.response_body)) ?
    jsondecode(data.http.config_with_validation.response_body) :
    {}
  )

  # Apply validation and defaults
  final_config = {
    # Only accept an allow-listed instance type; otherwise default.
    instance_type = (
      contains(["t3.micro", "t3.small", "t3.medium"],
        lookup(local.raw_config, "instance_type", "")) ?
      local.raw_config.instance_type :
      "t3.micro"
    )

    # Accept a numeric count in [1, 10]; otherwise default to 1.
    instance_count = (
      can(tonumber(lookup(local.raw_config, "instance_count", ""))) &&
      tonumber(local.raw_config.instance_count) >= 1 &&
      tonumber(local.raw_config.instance_count) <= 10 ?
      tonumber(local.raw_config.instance_count) :
      1
    )
  }
}
Key Takeaways
- Dynamic Configuration: Use data sources to make infrastructure responsive to external changes
- Error Handling: Always provide fallbacks and validation for external data
- Security: Secure API credentials and validate external data
- Dependencies: Chain data sources properly to ensure correct execution order
- Performance: Cache external data when possible and use timeouts
- Flexibility: Design configurations that work with or without external data
Next Steps
- Tutorial 22: Learn about built-in functions and expressions
- Practice integrating with various external APIs
- Experiment with complex data transformations
- Review the Terraform Data Sources Documentation