/ scripts / scraper / setup_ubuntu.sh
setup_ubuntu.sh
  1  #!/bin/bash
  2  # Setup script for News Scraper Daemon on Ubuntu
  3  # Run this script on your Ubuntu machine to set up the scraper
  4  
  5  set -e
  6  
  7  echo "========================================"
  8  echo "News Scraper Daemon - Ubuntu Setup"
  9  echo "========================================"
 10  
 11  # Colors for output
 12  RED='\033[0;31m'
 13  GREEN='\033[0;32m'
 14  YELLOW='\033[1;33m'
 15  NC='\033[0m' # No Color
 16  
 17  # Get the directory where this script is located
 18  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 19  REPO_ROOT="$(dirname "$(dirname "$SCRIPT_DIR")")"
 20  
 21  echo -e "${YELLOW}Repository root: $REPO_ROOT${NC}"
 22  
 23  # Check if running as root
 24  if [ "$EUID" -eq 0 ]; then
 25      echo -e "${RED}Please do not run this script as root${NC}"
 26      exit 1
 27  fi
 28  
 29  # Step 1: Install system dependencies
 30  echo ""
 31  echo -e "${GREEN}Step 1: Installing system dependencies...${NC}"
 32  sudo apt-get update
 33  sudo apt-get install -y python3 python3-pip python3-venv git
 34  
 35  # Step 2: Create virtual environment
 36  echo ""
 37  echo -e "${GREEN}Step 2: Setting up Python virtual environment...${NC}"
 38  cd "$REPO_ROOT"
 39  python3 -m venv venv
 40  source venv/bin/activate
 41  
 42  # Step 3: Install Python dependencies
 43  echo ""
 44  echo -e "${GREEN}Step 3: Installing Python dependencies...${NC}"
 45  pip install --upgrade pip
 46  pip install -r scripts/scraper/requirements.txt
 47  
 48  # Step 4: Install Playwright browsers
 49  echo ""
 50  echo -e "${GREEN}Step 4: Installing Playwright browsers...${NC}"
 51  playwright install chromium
 52  playwright install-deps chromium
 53  
 54  # Step 5: Set up logs directory
 55  echo ""
 56  echo -e "${GREEN}Step 5: Setting up logs directory...${NC}"
 57  mkdir -p logs
 58  touch logs/.gitkeep
 59  
 60  # Step 6: Configure git
 61  echo ""
 62  echo -e "${GREEN}Step 6: Configuring git...${NC}"
 63  git config user.name "News Scraper Bot"
 64  git config user.email "scraper@local"
 65  
 66  # Step 7: Configuration Setup
 67  echo ""
 68  echo -e "${YELLOW}Step 7: Configuration Setup${NC}"
 69  echo ""
 70  
 71  # Get Fork Repo
 72  echo "Enter your GitHub fork repository (e.g., 'username/Hong-Kong-Fire-Documentary')"
 73  read -p "Fork repo: " FORK_REPO
 74  
 75  if [ -z "$FORK_REPO" ]; then
 76      echo -e "${RED}Fork repo is required!${NC}"
 77      exit 1
 78  fi
 79  
 80  # Get GitHub Token
 81  echo ""
 82  echo "You need a GitHub Personal Access Token (PAT) with these permissions:"
 83  echo "  - Contents: Read and Write"
 84  echo "  - Pull requests: Read and Write"
 85  echo ""
 86  echo "Generate one at: https://github.com/settings/tokens?type=beta"
 87  echo ""
 88  
 89  read -p "Enter your GitHub token: " GITHUB_TOKEN
 90  
 91  if [ -z "$GITHUB_TOKEN" ]; then
 92      echo -e "${RED}GitHub token is required!${NC}"
 93      exit 1
 94  fi
 95  
 96  # Create environment file
 97  ENV_FILE="$HOME/.scraper_env"
 98  cat > "$ENV_FILE" << EOF
 99  GITHUB_TOKEN=$GITHUB_TOKEN
100  FORK_REPO=$FORK_REPO
101  EOF
102  chmod 600 "$ENV_FILE"
103  echo -e "${GREEN}Configuration saved to $ENV_FILE${NC}"
104  
105  # Export for current session
106  export GITHUB_TOKEN
107  export FORK_REPO
108  
109  # Step 8: Test the daemon
110  echo ""
111  echo -e "${GREEN}Step 8: Testing the daemon...${NC}"
112  echo "Running a single sync cycle..."
113  
114  cd "$REPO_ROOT"
115  source venv/bin/activate
116  python scripts/scraper/daemon.py --once
117  echo -e "${GREEN}Test completed!${NC}"
118  
119  # Step 9: Set up systemd service
120  echo ""
121  echo -e "${YELLOW}Step 9: systemd Service Setup${NC}"
122  echo ""
123  read -p "Do you want to install the systemd service? (y/n): " INSTALL_SERVICE
124  
125  if [ "$INSTALL_SERVICE" = "y" ]; then
126      # Create service file from template
127      SERVICE_FILE="/tmp/news-scraper.service"
128      
129      cat > "$SERVICE_FILE" << EOF
130  [Unit]
131  Description=Hong Kong Fire Documentary News Scraper Daemon
132  After=network-online.target
133  Wants=network-online.target
134  
135  [Service]
136  Type=simple
137  User=$USER
138  Group=$USER
139  WorkingDirectory=$REPO_ROOT
140  EnvironmentFile=$HOME/.scraper_env
141  ExecStart=$REPO_ROOT/venv/bin/python $REPO_ROOT/scripts/scraper/daemon.py
142  Restart=on-failure
143  RestartSec=30
144  StandardOutput=journal
145  StandardError=journal
146  
147  [Install]
148  WantedBy=multi-user.target
149  EOF
150  
151      # Install service
152      sudo cp "$SERVICE_FILE" /etc/systemd/system/news-scraper.service
153      sudo systemctl daemon-reload
154      sudo systemctl enable news-scraper
155      
156      echo ""
157      read -p "Start the service now? (y/n): " START_NOW
158      if [ "$START_NOW" = "y" ]; then
159          sudo systemctl start news-scraper
160          echo -e "${GREEN}Service started!${NC}"
161          echo ""
162          echo "View logs with: journalctl -u news-scraper -f"
163          echo "Check status with: sudo systemctl status news-scraper"
164      fi
165      
166      echo -e "${GREEN}Service installed!${NC}"
167  fi
168  
169  echo ""
170  echo "========================================"
171  echo -e "${GREEN}Setup Complete!${NC}"
172  echo "========================================"
173  echo ""
174  echo "To run the daemon manually:"
175  echo "  cd $REPO_ROOT"
176  echo "  source venv/bin/activate"
177  echo "  source ~/.scraper_env"
178  echo "  python scripts/scraper/daemon.py"
179  echo ""
180  echo "To check service status:"
181  echo "  sudo systemctl status news-scraper"
182  echo ""
183  echo "To view logs:"
184  echo "  journalctl -u news-scraper -f"
185  echo "  # or"
186  echo "  tail -f $REPO_ROOT/logs/scraper.log"
187  echo ""
188