Shell 文字處理工具是處理「繁瑣但非重複性」資料任務的最順手工具。這類工作不值得專門寫一個應用程式,但用 Shell 指令組合就能快速完成。TSV 檔案是常見的中間格式,之後再轉成 Excel。
Database 整合
MySQL
echo "SELECT * FROM users ..." | mysql --host=[host] --user=[usr] --password=[password] --default-character-set=utf8 [DB_NAME]
cat [sql file] | mysql --host=[host] --user=[usr] --password=[password] --default-character-set=utf8 [DB_NAME] | tee sql_result.tsv | head -n 10
PostgreSQL
echo '[YOUR_SQL]' | PGPASSWORD=[YOUR_PASSWORD] psql -U [user] -h [host] -p [port] | head -n 10
指令參考
Redirection: > 與 >>
$ ls -l ... > tmp.txt # Overwrite file
$ ls -l ... >> tmp.txt # Append to file
cat: 顯示檔案
$ cat [file] # Display file contents
$ cat -n [file] # Show with line numbers
$ cat -s [file] # Suppress multiple blank lines
$ cat > [file] # Create/overwrite (Ctrl+D to exit)
$ cat >> [file] # Append (Ctrl+D to exit)
pipe: | 資料串流
$ cmd1 | cmd2 | cmd3 | ...
# Example: paginate long output
$ ls -al / | less
find: 搜尋檔案
$ find [path] [conditions] [actions] [options]
# Find all files in home directory
$ find ~ -type f
# Find PNG files
$ find ~ -type f -name '*.png'
# Complex conditions with OR
$ find ~ \( -type f -not -perm 0600 \) -or \( -type d -not -perm 0700 \)
# Execute actions
$ find ~/Desktop -type f -name '*.png' -print | xargs ls -l
head, tail: 前幾行 / 後幾行
$ ls -al /usr/bin | head # First 10 lines
$ ls -al /usr/bin | head -n 5 # First 5 lines
$ ls -al /usr/bin | tail # Last 10 lines
$ ls -al /usr/bin | tail -n 5 # Last 5 lines
wc: 字數 / 行數統計
$ wc ~/Desktop/event.js
# Output: lines words bytes filepath
300 757 8813 /Users/wangwilliam/Desktop/event.js
$ wc -l [file] # Line count only
$ ls -1 [path] | wc -l # Count files in directory
tee: T 型管線輸出
# Read stdin, write to both stdout and file
$ ls -al /usr/bin | tee ~/Desktop/tmp.txt | less
grep: 模式比對
$ grep [options] pattern file
# Options:
# -i : Case-insensitive
# -v : Invert match (non-matching lines)
# -c : Count matches
# -n : Show line numbers
# -E : Extended regex
$ ls -al | grep -E '^[a-zA-Zl]{3}'
sort: 排序
# Sort by 5th field (file size), descending numeric
$ ls -al ~ | sort -nr -k 5
# Sort by column 3, character 7
$ sort -k 3.7 filename
# Options:
# -n : Numeric sort
# -r : Reverse sort
# -t : Field delimiter (default: whitespace, i.e. blank-to-nonblank transition)
# -k : Field to sort
uniq: 去除重複
$ cat test.tsv | sort | uniq
# Note: uniq only removes adjacent duplicates; sort first
cut: 擷取欄位
# Extract fields 1 and 3 (tab-delimited)
$ cat iris.data.tsv | cut -f 1,3 | head
# Using colon delimiter for /etc/passwd
$ cut -d ':' -f 1 /etc/passwd
$ cat /etc/passwd | cut -d ':' -f 1
# Extract by character positions
$ ls -l | cut -c 1-10
paste: 合併檔案
# Rearrange columns by extracting and pasting files
$ cut -f 1 iris.data.tsv > iris.data.first.tsv
$ cut -f 5 iris.data.tsv > iris.data.fifth.tsv
$ cut -f 2,3,4 iris.data.tsv > iris.data.else.tsv
$ paste iris.data.fifth.tsv iris.data.else.tsv iris.data.first.tsv
comm: 比較兩個檔案
$ comm A.txt B.txt
# Three-column output: (only A) (only B) (both)
$ comm -1 A.txt B.txt # Only show columns 2,3
$ comm -2 A.txt B.txt # Only show columns 1,3
$ comm -3 A.txt B.txt # Only show columns 1,2
diff, patch: 差異比對
# Diff operations
$ diff file1 file2
$ diff -w file1 file2 # Ignore whitespace
$ diff -y file1 file2 # Side-by-side view
$ diff -r directory1 directory2 # Recursive directories
$ diff -rq directory1 directory2 # Only filenames
# Patch operations
$ patch < patch_file.diff
$ patch -p1 < patch_file.diff
$ patch -R < patch_file.diff # Reverse patch
split: 分割大檔案
$ split -l 10 filename # 10 lines per file
$ split -n 5 filename # Split into 5 equal files
$ split -C 512 filename # 512 bytes max per file
tr: 字元轉換
$ tr find_characters replace_characters < filename
$ tr 'abcd' 'jkmn' < filename
$ tr -d 'input_characters' < filename # Delete characters
$ tr -s 'input_characters' < filename # Compress duplicates
$ tr "[:lower:]" "[:upper:]" < filename # Uppercase
$ tr -cd "[:print:]" < filename # Remove non-printable
sed: 串流編輯器
$ sed 's/find/replace/' filename # Replace first
$ sed -E 's/regex/replace/g' filename # Extended regex, global
$ sed -i '' 's/find/replace/g' filename # In-place edit
$ sed '/line_pattern/s/find/replace/' filename # Conditional replace
$ sed -e 's/find/replace/' -e 's/find/replace/' filename
$ sed 's#find#replace#' filename # Alternative delimiter
awk: 文字處理
$ awk '{print $5}' filename # Print 5th column
$ awk '/something/ {print $2}' filename # Conditional print
$ awk -F ',' '{print $NF}' filename # Last column, comma-delimited
$ awk '{s+=$1} END {print s}' filename # Sum first column
$ awk '{s+=$1; print $1} END {print "--------"; print s}' filename
xargs: 建立參數
$ arguments_source | xargs command
# Delete backup files
$ find . -name '*.backup' | xargs rm -v
# Handle spaces in filenames
$ arguments_source | tr '\n' '\0' | xargs -0 command